## The Role of Case Management in Misdemeanor Prosecution
## Lindsay Graef and Aurelie Ouss
##
## BUILD DATASET
## This script creates our main analysis dataset for the project.
## It also creates a table showing the progression of the sample build.

library(tidytable)
library(tidyverse)
library(daotools)
library(arrow)
library(modeest)
library(lubridate)
library(dplyr)
library(NIBRSFunction)
library(data.table)
library(odbc)
library(dbplyr)
library(tools)
library(tictoc)
library(daocore)
library(DBI)
library(conflicted)
library(kableExtra)
 
# Set conflict resolution for functions
# Several attached packages export functions with the same name. The calls below
# declare which package's version is used when a name is called unqualified:
# tidytable for the data-manipulation verbs, lubridate for the date accessors.
# NOTE(review): verbs not listed here (e.g. joins other than left_join) are not
# covered by this preference list -- confirm they resolve as intended.
conflicts_prefer(tidytable::filter(),
                 tidytable::arrange(),
                 tidytable::mutate(),
                 tidytable::rename(),
                 tidytable::left_join(),
                 tidytable::distinct(),
                 tidytable::summarize(),
                 tidytable::select(),
                 tidytable::case_when(),
                 tidytable::ifelse(),
                 tidytable::pivot_longer(),
                 tidytable::pivot_wider(),
                 tidytable::bind_rows(),
                 tidytable::group_by(),
                 tidytable::separate(),
                 tidytable::ungroup(),
                 tidytable::`%in%`,
                 tidytable::lead(),
                 tidytable::lag(),
                 tidytable::fill(),
                 lubridate::year(),
                 lubridate::month(),
                 lubridate::week(),
                 lubridate::wday()
)

# set seed for replication
# (fixes the random number generator state for any random steps that follow)
set.seed(52285)


#-------------------------------------------------------------------------------#
# FUNCTIONS --------------------------------------------------------------------#
#-------------------------------------------------------------------------------#


## Get Basic Merged Dataset from AOPC & DAOCMS----------------------------------

# Merge the case_listings (DAOCMS) and docket_entries (AOPC) data.
# Takes no arguments. Returns one table produced by a full join of the two
# sources, restricted to disposed MC51CR / CP51CR dockets, with `cl_flag` and
# `de_flag` (1/0) recording which source(s) each row came from.
merge_cl_de <- function(){
  # NOTE(review): load_case_listings() is presumed to make `case_listings`
  # available in scope -- it is used below without being assigned here.
  load_case_listings()

  # Read the docket entry cache and drop the two columns we never use
  entries_path <- "/srv/data/aopc/cache/docket_entries/philadelphia_docket_entry_data.parquet"
  docket_entries <- arrow::open_dataset(entries_path) %>%
    collect() %>%
    select(-c(county_or_court_num,case_type_tag))

  # Normalise comment text: lower case, with "." and "," stripped
  docket_entries$docket_entry_comments <- gsub(
    "\\.|,", "", str_to_lower(docket_entries$docket_entry_comments)
  )

  # Strip the HH:MM:SS component and parse what is left as a date so that it
  # can be matched against the listing date
  docket_entries$filed_date <- ymd(
    gsub(" [0-9]{2}:[0-9]{2}:[0-9]{2}", "", docket_entries$filed_date)
  )

  # Source markers, used after the join to tell where each row originated
  docket_entries$de_flag <- 1
  case_listings$cl_flag <- 1

  # The join below is a natural join on shared column names, so the second
  # docket_entries column is renamed to line up with case_listings.
  # NOTE(review): this rename is positional; presumably column 2 is filed_date.
  colnames(docket_entries)[2] <- "listing_date"

  merged <- full_join.(case_listings, docket_entries)

  # Drop unused columns, then keep only disposed MC / CP criminal dockets
  merged <- merged %>%
    select(-c(judge_participant_id,judge_first,judge_last,listing_month,
              dc_district,ada_participant_id,is_case_active,def_atty_participant_id,
              event_disp_reason_id,unit_type_id,disposition_type_id,listing_type_id)) %>%
    filter(startsWith(docket_number,"MC51CR")|startsWith(docket_number,"CP51CR")) %>%
    filter(!is.na(disposition_date))

  # Rows that exist in only one source have a missing flag for the other one
  merged$de_flag[is.na(merged$de_flag)] <- 0
  merged$cl_flag[is.na(merged$cl_flag)] <- 0

  merged
}


## Get Shared Cases
# Restrict to dockets that show up in BOTH sources (case_listings / DAOCMS and
# docket_entries / AOPC), then discard listings dated after today.
# `df` must carry the `de_flag` and `cl_flag` columns; both are dropped from the
# returned table.
# NOTE(review): an earlier version of this comment mentioned keeping only cases
# after 2010; no such date filter is applied in this function.
getSharedCases <- function(df){

  df %>%
    # a docket qualifies when at least one of its rows came from each source
    mutate(in_both_sources = any(de_flag == 1) & any(cl_flag == 1),
           .by = docket_number) %>%
    filter(in_both_sources) %>%
    select(-c(in_both_sources, de_flag, cl_flag)) %>%
    # no listings scheduled in the future
    filter(listing_date <= today())
}



## Getting Cases and Case Listings ---------------------------------------------


## Collapse Docket Entries and Deduplicate
# Reduces the data to one row per docket / listing date / courtroom. The text of
# all docket entry comments (and of all official docket entries) sharing that
# key is pasted together with " | " into the *_collapse columns; the original
# text columns are dropped and `id_dlc` is added as the row key.
collapseDocketEntries <- function(df){

  collapsed <- df %>%
    # key identifying a docket_number-listing-courtroom combination
    mutate(id_dlc = paste(docket_number, listing_date, court_room_num)) %>%
    # one combined string per key for each of the two text columns
    mutate(official_docket_entry_collapse = paste(official_docket_entry, collapse = " | "),
           docket_entry_comments_collapse = paste(docket_entry_comments, collapse = " | "),
           .by = id_dlc) %>%
    select(-official_docket_entry, -docket_entry_comments) %>%
    # the remaining rows within a key are duplicates for our purposes: keep the first
    distinct(id_dlc, .keep_all = TRUE)

  return(collapsed)
}



## Get Relevant Hearings
# Removes bail hearings and post-disposition hearings (e.g., probation hearings,
# payment plan conferences, etc.) Keeps sentencing / disposition hearing.
# Calculates expected "end_date" of case, then for cases that have a Gagnon Hearing
# or VOP hearing, we want to keep that hearing if it comes BEFORE the Trial or
# Sentencing date. Otherwise, want to remove it.
# Finally, calculates listing number. Need this for other functions, but then need to
# update listing number again later (after removing diversion hearings, etc.)
#
# Input:  df - listing-level table with docket_number, listing_date,
#              disposition_date, listing_type_cd, court_room_num and
#              docket_entry_comments_collapse.
# Output: the filtered table with `end_date` and `listing_number` added.
getRelevantHearings <- function(df){

  # listing types treated as bail / arraignment hearings throughout this function
  bail_listing_types <- c("Preliminary Arraignment","Arraignment",
                          "Emergency Release Hearing",
                          "Bail Hearing","Early Bail Review Status",
                          "Arraignment Preliminary Hearing")
  # listing types that indicate a probation-violation style hearing
  violation_pattern <- "Violation|\\bVOP|Gagnon"
  # sentinel meaning "no likely end date could be identified for this listing"
  no_end_date <- as.Date("1111-11-11")

  # remove hearings after disposition date
  df <- df %>%
    filter(listing_date <= disposition_date)

  ## Remove probation and diversion status hearings after sentencing and/or trial date
  df <- df %>%
    # calculate likely end date
    mutate(end_date = case_when(grepl("(?<!Probation\\b)Sentencing", listing_type_cd,
                                      ignore.case = TRUE, perl = TRUE) ~ listing_date,
                                (grepl("defendant sentenced", docket_entry_comments_collapse, ignore.case = TRUE)
                                 & !grepl(violation_pattern, listing_type_cd, ignore.case = TRUE)) ~ listing_date,
                                grepl("Trial", listing_type_cd, ignore.case = TRUE) ~
                                  listing_date,
                                # if none of the above, check for "sentenced" keyword
                                (grepl("sentenced", docket_entry_comments_collapse, ignore.case = TRUE)
                                 & !grepl(violation_pattern, listing_type_cd, ignore.case = TRUE)) ~ listing_date,
                                (grepl("sentence", docket_entry_comments_collapse, ignore.case = TRUE)
                                 & !grepl(violation_pattern, listing_type_cd, ignore.case = TRUE)) ~ listing_date,
                                TRUE ~ no_end_date)) %>%
    # the case-level end date is the latest candidate found on any of its listings
    mutate(end_date = max(end_date, na.rm = TRUE), .by = docket_number) %>%
    # remove probation hearings and diversion status hearings after likely end date
    mutate(remove_indicator = ifelse(grepl("Violation|\\bVOP|Gagnon|PCRA|PDC Status",
                                           listing_type_cd, ignore.case = TRUE)
                                     & listing_date > end_date, 1, 0)) %>%
    # keep those w/o end date since we are unsure of how hearings look here (small # of cases)
    filter(remove_indicator == 0 | end_date == no_end_date) %>%
    select(-remove_indicator)

  # also remove bail hearings (unless it looks like there's something more happening there)
  # need to simplify NA docket entry comments first: collapsing missing comments
  # with paste() yields the literal text "NA" (one entry) or "NA | NA | ..."
  # (several entries). Both forms are turned back into a real missing value.
  comments <- df$docket_entry_comments_collapse
  comments[grepl("^NA( \\| NA)*$", comments)] <- NA_character_
  df$docket_entry_comments_collapse <- comments

  # remove if type is preliminary arraignment AND there are no docket entry notes (means no ADA name)
  df <- df %>%
    filter(!(listing_type_cd %in% bail_listing_types
             & is.na(docket_entry_comments_collapse)))

  # see if those that are left make sense to keep or should be removed
  prelims <- df %>%
    filter(listing_type_cd %in% bail_listing_types) %>%
    # keep those with an "ada" identified, otherwise remove
    filter(grepl("ada", docket_entry_comments_collapse)) %>%
    filter(listing_type_cd != "Early Bail Review Status") %>% # want to remove all of these
    # also remove if bail hearing is in the basement (not MC room)
    filter(!(listing_type_cd %in% bail_listing_types &
               court_room_num == "B08"))

  # filter out remaining arraignments, then add back the few that we need to keep
  df <- df %>%
    filter(!(listing_type_cd %in% bail_listing_types)) %>%
    rbind(., prelims)

  # drop first status hearing in room 404 and any consecutive 404 hearings before
  # case is sent to MC trial room. This is evaluated separately for every docket:
  # a listing is dropped while no non-404 listing has yet been seen on its docket.
  # A missing courtroom counts as "not 404", so it ends the leading run.
  df <- df %>%
    arrange(docket_number, listing_date) %>%
    mutate(listing_number = row_number(), .by = docket_number) %>%
    mutate(in_leading_404_run = cumsum(!(court_room_num %in% "404")) == 0,
           .by = docket_number) %>%
    filter(!in_leading_404_run) %>%
    select(-in_leading_404_run)

  return(df)
}


## Get Lead Charges
# Uses version of the complaint charge table that's available in DAO tools
# gets lead charge grade for case (at charging) and creates summary lead charge variable
#
# Input:  df - listing-level table with docket_number, dc_number and defendant_pid.
# Output: df with lead_charge_grade_atCU_levels (e.g. "F2", "M1") and
#         lead_charge_grade_atCU ("H", "F", "M" or "S") joined on by docket_number.
#         The lookup holds exactly one row per docket, so the join cannot add rows.
getLeadCharges <- function(df){

  # compare to daotools version of complaint charge dataset
  cc <- get_complaint_charge_df()

  # first need to get defendant PID
  # NOTE(review): load_arrests() is presumed to make `pars_arrests` available.
  load_arrests()

  # df has one row per listing; reduce it to one row per docket / incident /
  # defendant first so the charge table is not multiplied by the listing count
  docket_keys <- df %>%
    distinct(docket_number, dc_number, defendant_pid)

  cc <- cc %>%
    # add in defendant PIDs from PARS
    left_join(., pars_arrests %>% select(arrest_id, dc_number, defendant_pid, arrest_date),
              by = c("dc_number","arrest_id")) %>%
    # add in docket numbers
    left_join(., docket_keys, by = c("dc_number", "defendant_pid")) %>%
    # charges that belong to no docket in df cannot contribute anything
    filter(!is.na(docket_number))

  # get lead charge grade by docket and keep just lead charge grade per case
  cc <- cc %>%
    # the first grade in this ordered list found on any charge of the docket wins
    mutate(lead_charge_grade_atCU_levels = case_when(any(grepl("H1", charge_grade)) ~ "H1",
                                                     any(grepl("H2", charge_grade)) ~ "H2",
                                                     any(grepl("H3", charge_grade)) ~ "H3",
                                                     any(grepl("H", charge_grade)) ~ "H",
                                                     any(grepl("F1", charge_grade)) ~ "F1",
                                                     any(grepl("F2", charge_grade)) ~ "F2",
                                                     any(grepl("F3", charge_grade)) ~ "F3",
                                                     any(grepl("F", charge_grade)) ~ "F",
                                                     any(grepl("M1", charge_grade)) ~ "M1",
                                                     any(grepl("M2", charge_grade)) ~ "M2",
                                                     any(grepl("M3", charge_grade)) ~ "M3",
                                                     any(grepl("M", charge_grade)) ~ "M",
                                                     any(grepl("S", charge_grade)) ~ "S"),
           .by = docket_number) %>%
    # the grade is constant within a docket, so this leaves one row per docket
    distinct(docket_number, lead_charge_grade_atCU_levels) %>%
    # also get a summary version of lead charge grade
    mutate(lead_charge_grade_atCU = case_when(grepl("H", lead_charge_grade_atCU_levels) ~ "H",
                                              grepl("F", lead_charge_grade_atCU_levels) ~ "F",
                                              grepl("M", lead_charge_grade_atCU_levels) ~ "M",
                                              grepl("S", lead_charge_grade_atCU_levels) ~ "S")) %>%
    select(docket_number, lead_charge_grade_atCU_levels, lead_charge_grade_atCU)

  # now merge lead charge info onto main dataset build
  df <- left_join(df, cc, by = c("docket_number"))

  return(df)
}



## Get Case Variables
# Adds case-level columns that are not part of case listings (arrest date,
# police district, case open date, charge counts and grade categories).
# The join is a natural join on whatever column names df shares with the
# selected case columns; dc_district is returned as a factor of trimmed strings.
getCasesVars <- function(df){

  # NOTE(review): load_cases() is presumed to make `cases` available.
  load_cases()

  case_vars <- select(cases,
                      docket_number, arrest_date, dc_number, dc_district, case_open_date,
                      num_charges, lead_charge_grade_category, dispo_charge_grade_category)

  with_case_vars <- left_join(df, case_vars)
  with_case_vars <- mutate(with_case_vars,
                           dc_district = as.factor(str_trim(dc_district)))

  with_case_vars
}




# Correct Diversion Cases-------------------------------------------------------


## Remove Cases Diverted at Charging
# identifies and removes MC cases that were diverted at charging
#
# Input:  df - listing-level table with docket_number, dispo_type, listing_date,
#              listing_type_cd, court_room_num and docket_entry_comments_collapse.
# Output: df without any listing of a docket judged to be diverted at charging.
removeChargingDiversions <- function(df){

  # bail / arraignment listing types skipped when looking for the first real
  # listing. The label "Early Bail Review Status" used elsewhere in this file is
  # included alongside "Early Bail Review" so both spellings are skipped.
  bail_listing_types <- c("Preliminary Arraignment", "Arraignment","Bail Hearing",
                          "Arraignment Preliminary Hearing","Emergency Release Hearing",
                          "Early Bail Review","Early Bail Review Status")

  # listing types that indicate the case went straight to a diversion program
  diversion_listing_types <- c("Status","Progress Listing","IP Status",
                               "TC - Progress Listing","DV Diversion Status",
                               "TC - Status","TC - Status Open",
                               "Summary Diversion Program","Status Listing",
                               "Status - Community Court","PDC Status",
                               "PDC Progress","DAP Status","DAP Progress")

  # courtrooms treated as diversion locations
  diversion_rooms <- c("Broad and Champlost","Academy & Red Lion",
                       "Community Court","Mental Health Clinic",
                       "3901 Whitaker Avenue","55th and Pine Streets")

  # get misdemeanor diversion cases
  diverted <- df %>%
    filter(dispo_type == "Diversion" & startsWith(docket_number,"MC51CR"))

  # If the first hearing after arraignment was "Status", "Progress Listing", or
  # "TC Status", etc. case was likely diverted at charging and goes directly to program
  # Otherwise, if there is a trial hearing scheduled first, it was diverted by MC ADA.
  diverted <- diverted %>%
    # sort by comment text so the row kept by distinct() is chosen deterministically
    arrange(docket_entry_comments_collapse) %>%
    distinct(docket_number, listing_date, listing_type_cd, .keep_all = TRUE) %>%
    arrange(listing_date) %>%
    filter(!listing_type_cd %in% bail_listing_types) %>% # remove bail hearings
    mutate(listing_number = row_number(), .by = docket_number) %>%
    # look at first post-bail listing for each docket number to see if it's a diversion listing
    filter(listing_number == 1) %>%
    mutate(CU_divert = ifelse(listing_type_cd %in% diversion_listing_types |
                                grepl("AMP|ARD|Accele?ratr?ed Misde?m?ean?m?or|Program Accept|ard program|TCY",
                                      docket_entry_comments_collapse, ignore.case = TRUE) |
                                court_room_num %in% diversion_rooms, 1, 0))

  # get list of CU diversion cases
  cu_divert <- diverted %>%
    filter(CU_divert == 1)

  # remove CU diversions from main dataset and return
  df <- df %>%
    filter(!docket_number %in% cu_divert$docket_number)

  return(df)
}


## Correct Diversion Dispositions
# Checks for diversion cases that were marked with "dismissed / withdrawn" disposition type.
# Checks that those cases were successfully completed diversion; creates a new updated
# dispo_type variable that reflects this.
#
# Input:  df - listing-level table with docket_number, listing_type_cd, dispo_type,
#              listing_number and docket_entry_comments_collapse (plus the other
#              columns selected below).
# Output: df with a factor column `dispo_type_update`: "Diversion" for dockets
#         identified as completed diversions, otherwise the original dispo_type.
correctDiversionDispos <- function(df){

  # listing types that suggest the docket passed through a diversion program.
  # The pattern must not end in "|": an empty alternative matches every string
  # and would let every docket through this filter.
  diversion_listing_pattern <-
    "Community Court|ARC|ARD|PDC|Status of Admission|DV|Diversion|TC|MHC"

  # identify possible diversion cases
  # some of these are clearly failures (guilty pleas, etc.) but it's the dismissed / withdrawn
  # cases that are concerning, since these may have been successful diversions (if they
  # show up as plea, etc., they either failed diversion or did not actually enter diversion)
  possible_diversion_cases <- df %>%
    filter(any(grepl(diversion_listing_pattern, listing_type_cd)), .by = docket_number) %>%
    filter(dispo_type %in% c("Dismissed/Withdrawn/Etc","Withdrawn in the Interest of Justice")) %>%
    select(docket_number,listing_date,listing_desc,listing_type_cd,listing_outcome,
           docket_entry_comments_collapse,official_docket_entry_collapse,
           dispo_type,case_disposition_type,listing_number)

  successes <- possible_diversion_cases %>%
    # look at last listing in case to see if it indicates diversion completion
    filter(listing_number == max(listing_number), .by = docket_number) %>%
    filter(grepl("conditions satisfied|(defendant )?completed|compliance",docket_entry_comments_collapse))

  # Update disposition for the cases that were successes
  df <- df %>%
    mutate(dispo_type_update = as.factor(ifelse(docket_number %in% successes$docket_number,
                                                "Diversion",as.character(dispo_type))))
  return(df)
}


## Get Diversion End Date
# Calculates best guess of date that a case enters diversion (for cases that successfully
# complete diversion). Then drops all hearings after the diversion entry date, and resets
# disposition date to reflect diversion entry date.
#
# Input:  df - listing-level table with docket_number, listing_date, disposition_date,
#              dispo_type_update, listing_type_cd, docket_entry_comments_collapse
#              and official_docket_entry_collapse.
# Output: df with `diversion_entry_date` (missing when the docket is not a diversion
#         or no entry listing could be identified) and `disposition_date_update`.
#         Dockets without an identified entry date keep all of their listings.
getDiversionEndDate <- function(df){

  # Comment text signalling entry into diversion. The alternatives are joined with
  # paste() so that no line break or indentation ends up inside the pattern.
  comment_entry_pattern <- paste(
    "defendant approved",
    "defendant accepted(?! service)",
    "treatment accepted",
    "(ard|amp|tc|mhc|pdc|diversion) approved",
    "defendant entered",
    sep = "|"
  )

  # Official docket entries signalling admission (matched case-sensitively)
  official_order_pattern <- paste(
    "Order Granting Motion to Admit Defendant into Veterans Court",
    "Order Granting Motion to Admit Defendant into AMP 2",
    "Order Granting Motion to Admit Defendant into Accelerated Misdemeanor Program",
    "Order Granting Acceptance",
    "Defendant Admitted",
    sep = "|"
  )

  # Official docket entries mentioning ARD (matched case-insensitively)
  official_ard_pattern <- "^ARD|[[:space:]]ARD$|[[:space:]]ARD[[:space:]]|Accelerated Rehab"

  # find first listing where we see "approved|accepted" for diversion
  entry_dates <- df %>%
    filter(dispo_type_update =="Diversion") %>%
    mutate(diversion_entry =
             grepl(comment_entry_pattern, docket_entry_comments_collapse,
                   ignore.case = TRUE, perl = TRUE) |
             grepl(official_order_pattern, official_docket_entry_collapse) |
             grepl(official_ard_pattern, official_docket_entry_collapse, ignore.case = TRUE) |
             grepl("Diversion Status", listing_type_cd)) %>%
    # date of the first flagged listing of each docket, in current row order;
    # match() returns NA when nothing is flagged, which gives a missing date
    mutate(diversion_entry_date = listing_date[match(TRUE, diversion_entry)],
           .by = docket_number) %>%
    distinct(docket_number, listing_date, diversion_entry_date)

  # Now add entry dates back onto main dataframe
  df <- df %>%
    left_join(., entry_dates, by = c("docket_number", "listing_date")) %>%
    # remove listings after diversion entry date
    filter(is.na(diversion_entry_date) |
             (!is.na(diversion_entry_date) & listing_date <= diversion_entry_date) ) %>%
    # create updated disposition date variable that sets dispo date to diversion entry date
    mutate(disposition_date_update = ifelse(!is.na(diversion_entry_date), diversion_entry_date,
                                            disposition_date))

  return(df)
}


## Remove all cases diverted at charging, regardless of whether they are successful or not.
# A docket is removed when its earliest listing in df (by listing_date) took place
# in one of the diversion courtrooms listed below. df is expected to have had
# bail / 404 listings removed already, so "earliest" means first post-bail / 404.
dropDiversionsAlternate <- function(df){

  diversion_rooms <- c("Broad and Champlost","Academy & Red Lion",
                       "Community Court","Mental Health Clinic",
                       "3901 Whitaker Avenue",
                       "55th and Pine Streets")

  # first listing of every docket, kept only if it was in a diversion courtroom
  first_listings <- df %>%
    arrange(listing_date) %>%
    mutate(listing_number = row_number(), .by = docket_number) %>%
    filter(listing_number == 1)

  started_diversion <- first_listings %>%
    filter(court_room_num %in% diversion_rooms)

  # everything belonging to those dockets is dropped from the main dataset
  remaining <- df %>%
    filter(!docket_number %in% started_diversion$docket_number)

  remaining
}

## Get Sent to 404 Variable
# Creates a flag for when MC Unit (or non-charging) ADA sends a case to 404 status
# room. Also creates a category called "404 Withdrawn" for cases that are sent to
# 404 to be withdrawn / dismissed / etc. I've already dropped all cases that get
# resolved in 404 from the get-go, so this is just cases that went to MC Unit and
# then got sent back.
#
# Input:  df - listing-level table with docket_number, listing_date, listing_number,
#              court_room_num and dispo_type.
# Output: df sorted by docket and listing date, with `sent_to_404` (1 when the
#         docket's next listing is in room 404 and the current one is not) and
#         `dispo_type_update`.
# NOTE(review): dispo_type_update is rebuilt from dispo_type here, so any value
# already stored in dispo_type_update is overwritten -- confirm the call order.
get404Vars <- function(df){

  # dockets whose final listing took place in room 404
  end404 <- df %>%
    filter(listing_number == max(listing_number), .by = docket_number) %>%
    filter(court_room_num == 404)

  df <- df %>%
    arrange(docket_number, listing_date) %>%
    # look ahead within the same docket only; the last listing of a docket has no
    # next listing, so the look-ahead defaults to FALSE there
    mutate(sent_to_404 = ifelse(lead(court_room_num==404, default = FALSE) &
                                  court_room_num != 404, 1, 0),
           .by = docket_number) %>%
    mutate(dispo_type_update = case_when(docket_number %in% end404$docket_number &
                                           grepl("Withdrawn|Diversion", dispo_type) ~ "404 Withdrawn",
                                         grepl("Withdrawn in the Interest", dispo_type) ~
                                           "Dismissed/Withdrawn/Etc",
                                         TRUE ~ dispo_type))
  return(df)
}



# FTA Functions-----------------------------------------------------------------


## Get FTA instances
# takes dataframe and column of text to search, exports new dataframe with FTA columns
# Note: dataframe must include the "event_disp_reason" variable from DAOCMS
#
# Arguments:
#   df                   - listing-level table; must contain `event_disp_reason` and
#                          the text column named by `docket_entry_colname`.
#   docket_entry_colname - name (string) of the text column to search. The patterns
#                          are lower case and matched case-sensitively, so the text
#                          is expected to be lower-cased already.
# Returns: df with 1/0 columns officer_fta, lpo_fta, victim_fta, witness_fta,
#   defendant_fta, defenseatty_fta, ada_fta, other_fta, cw_notready,
#   defense_notready, marked_mbt and witness_fta_combo.
getFTA <- function(df, docket_entry_colname){

  df <- df %>%
    # Police FTA (officers, detectives, state troopers)
    mutate(officer_fta = ifelse(grepl("ofta|did not subpoena (a )?(necessary )?(police )?officer|(trooper|(?<!prevention )off?r?icer|police|police office|ofcr|detective|sergeant|sargea?nt|inspector|lieutenant|corporal|captain|chief|[^l]po|p/o)\\'?s? ?[[:punct:]]?(is?n?|was|has|being)? ?(out)? ?(o/?n|at)? ?(fta|failed ?to ?appe?a?r|fai?led ?to ?app?r?ear?|failed ?to ?appre|unavail(able)?|not ?available|left|not present|not here|sick|ill|busy|vaca?t?i?o?n?|out of town|a?t?(family)? ?funeral|iod|injured on duty|hospital|training|trng|not checked in|released by mistake|not subpoena?e?d?|subpoena fail|had to leave|(has?d? )?(an? )?(dr'?s|doctors?)? appointment|fraternity leave|maternity leave|(has?d? )?(a )?family emergency|military leave)",df[[docket_entry_colname]], perl = TRUE),1,0)) %>%

    # LPO / Private Security Guard FTA
    mutate(lpo_fta = ifelse(grepl("(loss ?prevention ?offf?icer|lpo|security guard|security officer)\\'?s? ?[[:punct:]]?(is?n?|was|has|being)? ?(out)? ?(o/?n|at)? ?(fta|failed ?to ?appe?a?r|fai?led ?to ?app?r?ear?|failed ?to ?appre|unavail(able)?|not ?available|not present|not here|sick|ill|busy|vaca?t?i?o?n?|out of town|a?t?(family)? ?funeral|iod|injured on duty|hospital|training|released by mistake|not subpoena?e?d?|subpoena fail|had to leave|(has?d? )?(an? )?(dr'?s|doctors?)? appointment|fraternity leave|maternity leave|(has?d? )?(a )?family emergency|military leave)",df[[docket_entry_colname]], perl = TRUE),1,0)) %>%

    # Victim / Complaining Witness FTA
    mutate(victim_fta = ifelse(grepl("vfta|cw[[:punct:]]? ?fta|cw unavailable|(victim|c/w|complai?nt?(ing)? with?ness?|compl(ainant)?( witness?)?|\\bcwit(ness)?)e?s? ?[[:punct:]]?(are|is?n?|has|was|were)? ?(out)? ?(o/?n|at)? ?(fta|failed ?to ?appe?a?r|fai?led ?to ?app?r?ear?|failed ?to ?appre|unavail(able)?|not ?available|not present|not here|out of town|in rehab|sick|ill|busy|vaca?t?i?o?n?|a?t?(family)? ?funeral|(still )?(in )?hospital|in custody|not subpoena?e?d?|subpoena fail|had to leave|(has?d? )?(an? )?(dr'?s|doctors?)? appointment|fraternity leave|maternity leave|(has?d? )?(a )?family emergency|(has )?(relocated|moved))",df[[docket_entry_colname]], perl = TRUE),1,0)) %>%

    # Witness FTA
    mutate(witness_fta = ifelse(grepl("[^c]witt?n?/? ?fta|[^c]w/?fta|did not subpoena (the )?witness|((?<!complainant |complaint |complain |complaining|compl )(wit(ness)?|wit?h?ness))e?\\'?s?s? ?[[:punct:]]?(are|is?n?|has|was|were)? ?(out)? ?(o/?n|at)?(notified-?)? ?(fta|failed ?to ?appe?a?r|fai?led ?to ?app?r?ear?|failed ?to ?appre|unavail(able)?|not ?available|not present|not here|out of town|in rehab|sick|ill|busy|vaca?t?i?o?n?|a?t?(family)? ?funeral|(still )?(in )?hospital|in custody|not subpoena?e?d?|subpoena fail|had to leave|(has?d? )?(an? )?(dr'?s|doctors?)? appointment|fraternity leave|maternity leave|(needed )?emergency surgery|(has?d? )?(a )?family emergency|(has )?(relocated|moved))",df[[docket_entry_colname]],perl = TRUE),1,0)) %>%

    # Defendant FTA
    mutate(defendant_fta = ifelse(grepl("dfta|(defenda?e?nt|dfndt|deft?|dft)\\'?s? ?[[:punct:]]?(who)?(are|is?n?|has|was|were)? ?(out)? ?(o/?n|at)? ?(fta|failed ?to ?appe?a?r|fai?led ?to ?app?r?ear?|failed ?to ?appre|unavail(able)?|not ?available|not ?present(?! ?[[:punct:]]?in custody)|not here|out of town|in rehab|sick|ill|busy|vaca?t?i?o?n?|a?t?(family)? ?funeral|(still )?in hospital|not subpoena?e?d?|subpoena fail|had to leave|(has?d? )?(an? )?(dr'?s|doctors?)? appointment|fraternity leave|maternity leave|(has?d? )?(a )?family emergency|(has )?(relocated|moved))", df[[docket_entry_colname]],perl = TRUE),1,0)) %>%

    # Defense Attorney FTA
    mutate(defenseatty_fta = ifelse(grepl("(?<!district )(att?t?orney|attt?y|counsel|pd|defense|esq)\\'?s? ?[[:punct:]]?(is?n?|was)? ?(out)? ?(o/?n|at)? ?(fta|failed ?to ?appe?a?r|fai?led ?to ?app?r?ear?|failed ?to ?appre|unavail(able)?|not ?available|not present|not here|out of town|sick|ill|busy|vaca?t?i?o?n?|a?t?(family)? ?funeral|in hospital|injured|had to leave|(has?d? )?(an? )?(dr'?s|doctors?)? appointment|fraternity leave|maternity leave|(has?d? )?(a )?family emergency|on trial|in (juvenile|family|fed(eral)?|traffic) court|in (another room|another c(oun)?ty|camden|bucks cty|del ?co|del cty|mont co))", df[[docket_entry_colname]],perl = TRUE),1,0)) %>%
    # **probably more to clean here: look for "atty in" and see which ones to exclude

    # ADA FTA
    mutate(ada_fta = ifelse(grepl("(district ?attorney|ada|district ?atty)\\'?s? ?(is?n?|was)? ?(out)? ?(o/?n|at)? ?(fta|failed ?to ?appe?a?r|fai?led ?to ?app?r?ear?|failed ?to ?appre|unavail(able)?|not ?available|not present|not here|out of town|sick|ill|injured|busy|vaca?t?i?o?n?|a?t?(family)? ?funeral|in hospital|had to leave|(has?d? )?(an? )?(dr'?s|doctors?)? appointment|fraternity leave|maternity leave|(has?d? )?(a )?family emergency|on trial|in fed(eral)? court)", df[[docket_entry_colname]], perl = TRUE),1,0)) %>%

    # Other FTAs (catch other actors not listed above / help catch terms missed above)
    # adds "other fta" flag if all other FTA types are missing but we still catch FTA language
    # ("unavailable" is ignored when it follows discovery / equipment / testimony /
    # notes / reports; the misspelling "equiment" is kept so earlier matches still hold)
    mutate(other_fta = ifelse(officer_fta==0 & lpo_fta==0 & witness_fta==0 & victim_fta==0 & defendant_fta==0 & defenseatty_fta==0 & ada_fta==0 & grepl("fta|failed ?to ?appe?a?r|fai?led ?to ?app?r?ear?|failed ?to ?appre|(?<!discovery |equiment |equipment |testimony |notes |reports )unavail(able)|not ?here|not ?present", df[[docket_entry_colname]],perl = TRUE),1,0)) %>%

    # Commonwealth not ready
    mutate(cw_notready = ifelse(grepl("cw ?nr|cw ?- ?nr|cw ?/ ?nr|cw: ?nr|(cw|commonwealth) ?(is?n? )?(was ?)?not ready|(cw|commonwealth) ?[[:punct:]]?\\s*not ?ready", df[[docket_entry_colname]]),1,0)) %>%

    # Defense not ready
    mutate(defense_notready = ifelse(grepl("dnr|d-nr(?!immer)|def/? ?nr|(def|defens?c?e) ?(is?n? )?(was ?)?not ready|(def|defens?c?e) ?[[:punct:]]?\\s*not ?ready", df[[docket_entry_colname]],perl = TRUE),1,0)) %>%

    # "Must Be Tried" designations
    mutate(marked_mbt = ifelse(grepl("mbt|must ?be ?tr?i?e?d|must ?be ?ti?r?e?d|no (further )?continuances", df[[docket_entry_colname]]),1,0))

  # replace any NAs in the FTA columns (every column whose name ends in "fta") with zeros
  for(fta_col in grep("fta$", names(df), value = TRUE)){
    values <- df[[fta_col]]
    if(anyNA(values)){
      values[is.na(values)] <- 0
      df[[fta_col]] <- values
    }
  }

  # FINALIZE FTA VARIABLES
  # Combine information from case listings with greps above (sometimes each catches something the other missed)
  # Exact matches on event_disp_reason use %in% rather than ==, so a missing
  # reason counts as "no match" instead of turning the flag into NA.
  df <- df %>%
    mutate(ada_fta = ifelse(ada_fta==1 | grepl("((DA|prosecut).*(atty|attorney|ADA).*(unavailable|fta|failure|sick|vacation|busy))", event_disp_reason, ignore.case = TRUE),1,0),
           defenseatty_fta = ifelse(defenseatty_fta==1 | grepl("(Defense.*(atty|attorney).*(unavailable|fta|failure|sick|vacation|busy))", event_disp_reason, ignore.case = TRUE),1,0),
           witness_fta = ifelse(witness_fta==1 | event_disp_reason %in% "Defense - Witness Unavailable" | grepl("((DA|prosecut).*(witness).*(unavailable|fta|failure|sick|vacation|busy)($|(?! - police)))", event_disp_reason, ignore.case = TRUE, perl = TRUE),1,0),
           victim_fta = ifelse(victim_fta==1 | grepl("((DA|prosecut).*(victim).*(unavailable|fta|failure|sick|vacation|busy))", event_disp_reason, ignore.case = TRUE),1,0),
           officer_fta = ifelse(officer_fta==1 | event_disp_reason %in% "Prosecution Witness Unavailable - Police" | grepl("((DA|prosecut).*(police|officer).*(unavailable|fta|failure|sick|vacation|busy|training))",event_disp_reason, ignore.case = TRUE),1,0),
           defendant_fta = ifelse(defendant_fta==1 | grepl("(Defense.*(defendant|def).*(unavailable|fta|failure|sick|busy|arrived late))", event_disp_reason, ignore.case = TRUE),1,0)) %>%
    # also build a witness FTA combo variable that includes LPO FTAs and witness FTAs
    mutate(witness_fta_combo = ifelse(witness_fta == 1|lpo_fta == 1, 1, 0))

  return(df)
}



## Get Witness FTA Type (cw or defense)
# Labels every listing with `witness_fta_type`, attributing a witness FTA to one
# side based on docket_entry_comments_collapse, event_disp_reason and the
# cw_notready / defense_notready flags:
#   "Commonwealth"   - witness FTA with evidence pointing to the commonwealth
#   "Defense"        - witness FTA with evidence pointing to the defense
#   "No Witness FTA" - witness_fta is 0
#   "Unclear"        - anything else
# Commonwealth evidence is checked first, so it wins when both sides match.
getWitnessFTAType <- function(df){

  comments <- df$docket_entry_comments_collapse
  reason <- df$event_disp_reason

  # evidence that the missing witness belongs to the commonwealth
  cw_side_evidence <-
    grepl("ada wit fta|cw witn? fta|defen?s?e ready", comments) |
    (df$cw_notready == 1 & df$defense_notready == 0) |
    grepl("DA - Victim/Witness|DA - Witness|Prosecution Witness Unavailable - DA", reason) |
    grepl("commonwealth witness|commonwealth request ?-* ?witness|cw necessary witness|commonwealth request continuance witnes|commonwealth\\'s request;? witness", comments)

  # evidence that the missing witness belongs to the defense
  defense_side_evidence <-
    df$defense_notready == 1 |
    grepl("Defense - Witness", reason) |
    grepl("defense witn?(ess)? ?(unav|fail|fta)|defense request witness ?(unav|fta|fail)", comments)

  df %>%
    mutate(witness_fta_type = case_when(witness_fta == 1 & cw_side_evidence ~ "Commonwealth",
                                        witness_fta == 1 & defense_side_evidence ~ "Defense",
                                        witness_fta == 0 ~ "No Witness FTA",
                                        TRUE ~ "Unclear"))
}


## Update victim FTAs
# We think the steno may sometimes mark a general witness FTA when it really is a
# victim FTA. This reclassifies hearings where a witness FTA was recorded, no
# witness is ID'd on the case (witness_case == 0), a victim is ID'd on the case
# (victim_case == 1) and no victim FTA was recorded: victim_fta is set to 1 and
# witness_fta to 0. The case-level any_victim_fta / any_witness_fta flags are then
# recomputed by docket_number.
# df: listing-level data with id_dlc, docket_number, witness_fta, victim_fta,
#     lpo_fta, witness_case and victim_case.
# Returns df with the four variables above updated. Other variables derived from
# witness_fta (e.g. witness_fta_combo, witness_fta_type) are NOT recomputed here.
updateVictimFTAs <- function(df){
  
  # pull out misclassified FTAs (rows where any of the four flags is NA are not selected)
  temp <- df %>%
    filter(witness_fta==1 & witness_case==0 & victim_fta==0 & victim_case==1)
  
  # every row sharing an id_dlc with a misclassified hearing is updated
  is_misclassified <- df$id_dlc %in% temp$id_dlc
  
  # mark these hearings as victim FTA instead of witness FTA.
  # Assigning at the column level with one mask avoids replacing whole rows of the
  # table twice; integer literals keep the existing column type (integer or double).
  df$victim_fta[is_misclassified] <- 1L
  df$witness_fta[is_misclassified] <- 0L
  
  # also need to update the case-level FTA variables
  df <- df %>%
    mutate(any_victim_fta = ifelse(any(victim_fta==1),1,0), 
           any_witness_fta = ifelse(any(witness_fta==1)|any(lpo_fta==1),1,0),
           .by = docket_number)
  
  return(df)
}


# Cleaning / Variable Creation Functions----------------------------------------

## Clean PA Statute variables
# Splits lead_charge_offense_code_with_subsection into three new character
# columns (statute_title, statute_section, statute_subsection) so they can be
# passed to the NIBRS function. Rows where a piece cannot be found get NA.
# Note: in the section patterns ".?" is an unescaped dot, i.e. it matches any
# single character, not only a literal ".".
cleanStatutes <- function(df){
  
  raw_code <- df$lead_charge_offense_code_with_subsection
  
  # title: the leading run of digits
  df$statute_title <- str_extract(raw_code, "^[0-9]+")
  
  # section: first grab "<non-digit><space><digits...>", then keep only the
  # captured digits part, and finally drop the "_" at the end of some sections
  section_raw <- str_extract(raw_code, "[^0-9] [0-9]+-?.?([0-9]+)?")
  section_digits <- gsub("[^0-9] ([0-9]+-?.?([0-9]+)?)", "\\1", section_raw)
  df$statute_section <- gsub("_$", "", section_digits)
  
  # subsection: trailing token preceded by a non-digit; the second extract
  # removes that leading character (the squigglies)
  subsection_raw <- str_extract(raw_code, "[^0-9]([A-Z]*[0-9]*[0-9A-Z]?)$")
  df$statute_subsection <- str_extract(subsection_raw, "([A-Z]*[0-9]*[0-9A-Z]?)$")
  
  df
}


## Get Crime Categories
# Adds crimebigcat, a coarse offense category built from the UCR description
# (ucr_desc), the NIBRS description (nibrs_desc) and the lead charge description.
# Values: "DUI", "Violent", "Drugs", "Public Order", "Property", "Other".
# Categories are tested in the order listed, and the first match wins.
# Note: run this after the NIBRS function (it needs nibrs_desc and ucr_desc).
getCrimeCat <- function(df){
  
  # violent
  rx_violent_ucr <- "Assault|Weapon|Sex|Manslaughter|Human"
  rx_violent_nibrs <- "Kidnapping|Reckless Endangerment|Rape|Robbery|Sexual Assault"
  rx_violent_charge <- "Make Repairs/Sell/Etc Offens Weap|Poss Instrument Of Crime W/Int|Corpse|Bomb Threat|Propel Missile Into Occ Vehicles|Poss. Firearm In Ct. Facility For Use In Crime|Uses Incapacitation Device"
  
  # drugs
  rx_drug_charge <- "Controlled Substance|Disp Cont Subst|Furnishing Drug Free Urine|Know Dist Sche I Or Ii Contd Subs|Sale Give Contr Subs To Dep Person|Smell/Inhale Toxic Releasing Substances|Adult/Muti/Dest Label"
  
  # public order
  rx_order_ucr <- "Disorderly|Liquor|Vagrancy|Prostitution|Drunkenness"
  rx_order_nibrs <- "Trespass|Obscene|Drunkenness"
  rx_order_charge <- "Corruption Of Minors|Deposit Trash On Street|Dogs Not Validly Registered|Intentional Desecration Of Public Monument|Operating Watercraft|Scatter Rubbish|Tattooing Minor"
  
  # property
  rx_property_ucr <- "Stolen|theft|Theft|Vandalism|Fraud|Forgery"
  rx_property_nibrs <- "Bad Checks|Contraband|Burglary|Arson"
  rx_property_charge <- "Conceal Info Affecting Benefits|Conspiracy|Unauthorized Sale/Transfer Of Tickets|Unlawful Use Recording Device In Theater|Conceal Info Affecting Benefits|Recorde?d? Device"
  
  df %>%
    mutate(crimebigcat = case_when(
      
      # note: many "controlled substance" charges are DUIs, so the DUI rule
      # has to stay ahead of the Drugs rule
      grepl("Driving Under", nibrs_desc) |
        grepl("Driving Under", ucr_desc) ~ "DUI",
      
      grepl(rx_violent_ucr, ucr_desc) |
        grepl(rx_violent_nibrs, nibrs_desc) |
        grepl(rx_violent_charge, lead_charge_description) ~ "Violent",
      
      grepl("Drug", ucr_desc) |
        grepl("Drug", nibrs_desc) |
        grepl(rx_drug_charge, lead_charge_description) ~ "Drugs",
      
      grepl(rx_order_ucr, ucr_desc) |
        grepl(rx_order_nibrs, nibrs_desc) |
        grepl(rx_order_charge, lead_charge_description) ~ "Public Order",
      
      grepl(rx_property_ucr, ucr_desc) |
        grepl(rx_property_nibrs, nibrs_desc) |
        grepl(rx_property_charge, lead_charge_description) ~ "Property",
      
      # nothing matched (grepl() is FALSE for missing descriptions)
      TRUE ~ "Other"
    ))
}


## Get Lead Offense at Charging 
# Pulls lead charge at charging from Julia's BWC code, merges to main build,
# then creates crimebigcat_charging variable of crime category at charging.
# Removes granular charging variables when finished.
# df:       main build; needs dc_number and defendant_pid.
# bwc_path: path to the BWC .rds file holding dc_pid and the charging_* indicator
#           columns. Defaults to the server location used by the project, so
#           existing calls getLeadOffAtCharging(df) are unchanged.
# Returns df with dc_pid ("<dc_number>;<defendant_pid>") and crimebigcat_charging
# added. Values: "DUI", "Violent", "Drugs", "Public Order", "Property", "Other".
# NOTE(review): rows for which no condition is TRUE get "Other"; this presumably
# includes rows with no BWC match (their charging_* indicators would be NA) --
# confirm that lumping unmatched cases into "Other" is intended.
getLeadOffAtCharging <- function(df, bwc_path = "/srv/data/penn/bwc_jr/bwc_data_inputs_dc_pid.rds"){
  
  # load charging info from BWC datasets
  bwc <- readRDS(bwc_path) %>%
    select(dc_pid, starts_with("charging_")) 
  
  df <- df %>%
    # build the same dc_pid key that the BWC data is keyed on
    mutate(dc_pid = paste(dc_number, defendant_pid, sep = ";")) %>%
    left_join(., bwc, by = c("dc_pid")) %>%
    # build new crime category variable for charging lead charge
    # (order matters: first matching category wins)
    mutate(crimebigcat_charging = case_when(
      `charging_Driving Under the Influence` == 1 ~ "DUI",
      
      `charging_Aggravated Assault` == 1|`charging_Murder/Manslaughter`== 1 |
        `charging_Rape/Sex Offense` == 1 | charging_Robbery == 1 | `charging_Simple Assault` == 1 | 
        charging_Weapons == 1 ~ "Violent",
      
      `charging_Drug PWID, Manufacturing, Sales` == 1 | `charging_Drug Possession` == 1 ~ "Drugs",
      
      charging_Drunkenness == 1 | charging_Prostitution == 1 |
        `charging_Public Order` == 1 ~ "Public Order",
      
      charging_Property == 1 | charging_Shoplifting == 1 | 
        `charging_Burglary/Breaking and Entering` == 1 ~ "Property",
      TRUE ~ "Other")) %>%
    # remove unnecessary columns (just keep high-level crime categories at charging)
    select(-c(starts_with("charging")))
  
  return(df)
}


## Create Case-level FTA variables 
# Adds listing-level and case-level failure-to-appear (FTA) summaries.
# Listing level: listing_fta_count (number of FTA flags set on the listing, NAs
#   ignored) and listing_has_fta (1 if that count is above 0).
# Case level (by docket_number): ftas_per_case, any_fta, one any_* flag per party,
#   any_cw_fta (victim, witness, officer or LPO) and any_non_def_fta (anyone other
#   than the defendant).
# Note: this is at the docket level (if add felonies, need to switch back to cross court ID)
getCaseFTAInfo <- function(df){
  
  df %>%
    # how many of the eight FTA flags are set on this listing
    mutate(listing_fta_count = rowSums(.[, c("officer_fta", "witness_fta", "victim_fta",
                                             "defendant_fta", "defenseatty_fta", "lpo_fta",
                                             "ada_fta", "other_fta")],
                                       na.rm = TRUE)) %>%
    mutate(listing_has_fta = ifelse(listing_fta_count > 0, 1, 0)) %>%
    # case totals and one indicator per party
    mutate(ftas_per_case = sum(listing_fta_count),
           any_fta = ifelse(any(listing_fta_count > 0), 1, 0),
           any_victim_fta = ifelse(any(victim_fta == 1), 1, 0),
           # witness flag also counts LPO FTAs
           any_witness_fta = ifelse(any(witness_fta == 1) | any(lpo_fta == 1), 1, 0),
           any_officer_fta = ifelse(any(officer_fta == 1), 1, 0),
           any_lpo_fta = ifelse(any(lpo_fta == 1), 1, 0),
           any_def_fta = ifelse(any(defendant_fta == 1), 1, 0),
           any_defatty_fta = ifelse(any(defenseatty_fta == 1), 1, 0),
           any_ada_fta = ifelse(any(ada_fta == 1), 1, 0),
           any_other_fta = ifelse(any(other_fta == 1), 1, 0),
           .by = docket_number) %>%
    # composite indicators built from the per-party case flags
    mutate(any_cw_fta = ifelse(any_victim_fta == 1 | any_witness_fta == 1 |
                                 any_officer_fta == 1 | any_lpo_fta == 1, 1, 0),
           any_non_def_fta = ifelse(any_victim_fta == 1 | any_witness_fta == 1 |
                                      any_officer_fta == 1 | any_lpo_fta == 1 |
                                      any_defatty_fta == 1 | any_ada_fta == 1 |
                                      any_other_fta == 1, 1, 0),
           .by = docket_number)
}


## Get Attorney Info
# Adds the defense attorney type recorded in participants_sum, plus:
#   atty_type_cat - "Private", "Court Appointed Private", "Public Defender" or NA
#   def_atty_pd   - 1 if atty_type_cat is "Public Defender", 0 otherwise
#                   (NA when atty_type_cat is NA)
# Note these are imperfect matches given that defense attorneys can change a lot
# throughout a case.
getAttyInfo <- function(df){
  
  # makes participants_sum available (project loader)
  load_participants_sum()
  
  atty_lookup <- participants_sum %>%
    select(docket_number, defendant_pid, defendant_attorney_type)
  
  df %>%
    # no `by` given: joins on the columns the two tables share
    left_join(., atty_lookup) %>%
    mutate(atty_type_cat = case_when(
      # exactly "Private" first; other strings that merely contain "Private"
      # fall into the court-appointed bucket below
      defendant_attorney_type == "Private" ~ "Private",
      grepl("Court Appointed|Private", defendant_attorney_type) ~ "Court Appointed Private",
      grepl("Public Defender|Federal Defender", defendant_attorney_type) ~ "Public Defender",
      TRUE ~ NA_character_
    )) %>%
    mutate(def_atty_pd = ifelse(atty_type_cat == "Public Defender", 1, 0))
}



## Get Victim Info
# Gets available victim information. Note that this is imperfect identification
# of victims; data collected and entered by victim witness coordinators. 
# Use victim last name to get n() of cases with victims.
# Adds:
#   victim_* columns from participants_sum (names lower-cased, birth date parsed)
#   victim_case      - 1 if any row on the docket has a victim first or last name
#   victim_age       - years between victim_birth_date and arrest_date (days / 365)
#   victim_gendercat - "Male" / "Female" / "Transgender" / NA
#   victim_racecat   - "Latinx" / "Black" / "White" / "Other" / NA
getVictimInfo <- function(df){
  
  # makes participants_sum available (project loader)
  load_participants_sum()
  
  participants_sum <- participants_sum %>%
    select(docket_number,defendant_pid,victim_last_name,victim_first_name,victim_gender,
            victim_ethnicity_cd,victim_birth_date) %>%
    mutate(victim_first_name = str_to_lower(victim_first_name),
            victim_last_name = str_to_lower(victim_last_name),
            victim_birth_date = ymd(victim_birth_date))
  
  df <- df %>%
    # no `by` given: joins on the columns the two tables share
    left_join(., participants_sum) %>%
    mutate(victim_case = ifelse(any(!is.na(victim_last_name))|any(!is.na(victim_first_name)),1,0),
            .by = docket_number) %>%
    # units must be fixed to days: with the default units = "auto", difftime()
    # picks the unit from the smallest non-missing difference in the column, so a
    # single victim whose birth date equals the arrest date would switch every
    # row to seconds and inflate all ages.
    mutate(victim_age = as.numeric(difftime(arrest_date, victim_birth_date, units = "days"))/365,
            victim_gendercat = case_when(victim_gender=="M" ~ "Male",
                                          victim_gender=="F" ~ "Female",
                                          victim_gender=="T" ~ "Transgender",
                                          TRUE ~ NA_character_)) %>%
    mutate(victim_racecat = case_when(victim_ethnicity_cd == "Latinx" ~ "Latinx",
                                        victim_ethnicity_cd == "Black" ~ "Black",
                                        victim_ethnicity_cd == "White" ~ "White",
                                        grepl("Asian|Native", victim_ethnicity_cd) ~ "Other",
                                        TRUE ~ NA_character_))
  return(df)
}



## Get Witness Info
# Gets indicator for whether there is a witness identified on a docket.
# Note this is imperfect; unclear who enters this information. There is some limited
# gender, age, and ethnicity information if we want to add that in later.
# Adds witness_case: 1 if ANY participant row on the docket with a type matching
# "Witness" or "Crime Scene Property Owner" has a first name, last name or gender
# recorded; 0 otherwise, including dockets with no such participant rows at all.
getWitnessInfo <- function(df){
  
  # makes participants available (project loader)
  load_participants()
  
  witnesses <- participants %>%
    select(docket_number, first_name, last_name, gender, participant_type) %>%
    filter(grepl("Witness|Crime Scene Property Owner",participant_type)) %>%
    rename(witness_first_name = first_name,
            witness_last_name = last_name,
            witness_gender = gender) %>%
    # evaluate the flag over all of the docket's witness rows. Computing it per
    # row and then keeping the first row per docket made the result depend on
    # which row happened to come first.
    mutate(witness_case = ifelse(any(!is.na(witness_last_name)|!is.na(witness_first_name)
                                     |!is.na(witness_gender)),1,0),
           .by = docket_number) %>%
    distinct(docket_number, .keep_all = TRUE) %>%
    select(docket_number, witness_case)
  
  df <- left_join(df, witnesses) %>%
    # dockets with no witness rows come back NA from the join; they have no
    # identified witness, so code them 0 (filters such as witness_case == 0
    # would otherwise silently skip them)
    mutate(witness_case = ifelse(is.na(witness_case), 0, witness_case))
  
  return(df)
}



## Get Defendant Info
# Gets defendant demographic info from PARS, which is more reliable for race / ethnicity
# Looks like cases has better info for defendant age / gender
# Adds:
#   defendant_race_PARS - race as recorded in PARS (first row per dc_number / PID)
#   def_racecat         - factor: "Latinx Any Race", "Black", "White", "Other";
#                         NA when PARS race is "Unknown" or missing
#   arrest_year         - factor, calendar year of arrest_date
#   def_age_arrest      - years between defendant_date_of_birth and arrest_date (days / 365)
#   def_gendercat       - "Male" / "Female" / "Transgender" / NA
getDefendantInfo <- function(df){
  
  # get defendant ethnicity variables from PARS
  load_arrests()
  
  pars_arrests <- pars_arrests %>%
    select(dc_number,defendant_pid,defendant_race) %>%
    rename(defendant_race_PARS = defendant_race) %>%
    distinct(dc_number,defendant_pid, .keep_all = TRUE)
  
  df <- df %>%
    # no `by` given: joins on the columns the two tables share
    left_join(., pars_arrests) %>%
    # build simplified race / ethnicity categories
    mutate(def_racecat = case_when(grepl("Latino",defendant_race_PARS) ~ "Latinx Any Race",
                                     grepl("Black", defendant_race_PARS) ~ "Black",
                                     defendant_race_PARS == "White" ~ "White",
                                     # missing race (e.g. no PARS match) is unknown too;
                                     # without this it fell through to "Other"
                                     is.na(defendant_race_PARS) | defendant_race_PARS == "Unknown" ~ NA_character_,
                                     TRUE ~ "Other")) %>%
    mutate(def_racecat = as.factor(def_racecat)) %>%
    # get defendant age at arrest and gender category from DAOCMS variables.
    # units must be fixed to days: with the default units = "auto", difftime()
    # picks the unit from the smallest non-missing difference in the column, so
    # one record with date of birth equal to arrest date would switch every row
    # to seconds.
    mutate(arrest_year = as.factor(year(arrest_date)),
           def_age_arrest = as.numeric(difftime(arrest_date, defendant_date_of_birth, units = "days"))/365) %>%
    mutate(def_gendercat = case_when(defendant_gender=="M" ~ "Male",
                                     defendant_gender=="F" ~ "Female",
                                     defendant_gender=="T" ~ "Transgender",
                                       TRUE ~ NA_character_)) 
  return(df)
}




# Get Recidivism and Priors Variables-------------------------------------------



## Get Recidivism (new case)
# Gets variable indicating if new case was opened within 2 years of case open date
# Note we are using MC cases to calculate this so we can only look at recid from
# open date. If we want to adapt for CP / to look from dispo date, need to use merged 
# MC and CP cases.
# Adds to df:
#   reciddiff_days_fromOGopen - case_open_date of the defendant's next MC case minus
#                               this case's open date (NA if there is no later case)
#   recid_win_2yr_fromOGopen  - 1 if that gap is > 0 and <= 730, otherwise 0
# Rows of df that match nothing in either merge below keep NA in both variables.
getRecid <- function(df){
  
  # need to use all cases, not just cases in our misdemeanors dataset
  load_cases()
  
  # one row per defendant and case open date, for all MC cases
  recid <- cases %>%
    filter(startsWith(docket_number, "MC51CR")) %>%  # keep just MC cases, all cases start in MC
    mutate(docket_year = str_extract(docket_number, "[0-9]{4}$")) %>%
    select(docket_number,docket_year,case_open_date,disposition_date,
            defendant_pid,otn,dc_number) %>%
    filter(!is.na(defendant_pid)) %>%  # can only determine priors for cases that have PID
    distinct(docket_number, .keep_all = T) %>% 
    mutate(defendant_pid = trimws(defendant_pid)) %>%
    filter(defendant_pid != "" & defendant_pid != 0) %>% # remove weird PIDs
    # Distinct by otn-date because may have several cases opened on same day
    # (keeps the first row encountered per defendant-OTN; rows are not sorted by date yet)
    distinct(defendant_pid, otn, .keep_all = T) %>%
    # collapse OTN by date (will use to merge later, but want distinct dates for now)
    mutate(otn = paste(otn, collapse = ";"), .by = c(defendant_pid, case_open_date)) %>%
    arrange(defendant_pid, case_open_date) %>%
    # keep first case open date per OTN (may be multiple cases opened on same day for same OTN)
    distinct(defendant_pid, case_open_date, .keep_all = T) %>%
    select(-c(docket_number, docket_year, dc_number)) %>% # not relevant after distinct otn-pid-date
    # newest case first, so row_number() - 1 counts the cases opened after this one
    arrange(defendant_pid, desc(case_open_date)) %>%
    # note: number_of_future_offenses is not kept by the select() in recid_final below
    mutate(number_of_future_offenses = row_number() - 1, .by = defendant_pid)
  
  recid_final <- recid %>%
    # oldest case first, so lead() is the defendant's next case open date
    arrange(defendant_pid, case_open_date) %>%
    mutate(reciddiff_days_fromOGopen = as.numeric(dplyr::lead(case_open_date) - case_open_date),
            .by = defendant_pid) %>%
    # get recidivism. If time to new case is NA it means there was no new case after the last case
    mutate(recid_win_2yr_fromOGopen = case_when(is.na(reciddiff_days_fromOGopen) ~ 0,
                                                  reciddiff_days_fromOGopen <= 730 &
                                                    reciddiff_days_fromOGopen > 0 ~ 1,
                                                  TRUE ~ 0)) %>%
    # keep just the variables we need (will merge on PID and case open date)
    select(case_open_date, defendant_pid, otn, reciddiff_days_fromOGopen, recid_win_2yr_fromOGopen) %>%
    # get OTNs back so we can merge on open date and OTN
    # (undoes the ";" collapse above into up to five columns, then one row per OTN)
    separate(otn, c("otn1","otn2","otn3","otn4","otn5"), sep = ";") %>%
    pivot_longer(., 
                  cols = otn1:otn5,
                  names_to = "otn_number",
                  values_to = "otn",
                  values_drop_na = TRUE) %>%
    select(-otn_number) 
  
  # Merge back by defendant and case open date
  df <- left_join(df, recid_final, by = c("case_open_date", "defendant_pid", "otn"))
  
  # a few cases (~900) that are NA are because they have a different case open date but
  # the same defendant pid and OTN as a case that was matched (usually within 1 day of the
  # other case open date for same OTN). Assume that they have the same recidivism outcome
  # as the PID-OTN-date match (might just be off by a few days).
  # Second pass: re-join the unmatched rows on OTN and PID only.
  temp <- df %>%
    filter(is.na(recid_win_2yr_fromOGopen)) %>%
    select(-recid_win_2yr_fromOGopen, -reciddiff_days_fromOGopen) %>%
    left_join(., recid_final %>% select(-case_open_date), by = c("otn", "defendant_pid"))
  
  # drop missing rows from main DF and replace with rows from temp
  # (the re-joined rows are appended at the bottom, so the row order of df changes)
  df <- df %>%
    filter(!is.na(recid_win_2yr_fromOGopen)) %>%
    rbind(., temp)
  
  return(df)
}



## Get Recidivism (new arrest)
# Gets variable indicating if defendant was rearrested within 2 years of case open date
# Adds to df:
#   dc_pid                      - "<dc_number>;<defendant_pid>" merge key
#   pid_arrest_date             - "<defendant_pid>;<arrest_date>" of the matched PARS arrest
#   number_of_future_arrests    - number of later distinct arrest dates for the defendant
#   rearrest_date               - the defendant's next arrest date after the matched arrest
#   rearrest_days_fromOGopen    - rearrest_date minus case_open_date
#   rearrest_win_2yr_fromOGopen - 1 if that gap is > 0 and <= 730, otherwise 0
getRecidArrest <- function(df){
  
  # makes pars_arrests available (project loader)
  load_arrests()
  
  # arrests from 2010 on with a usable PID, one row per dc_pid
  arrests_clean <- pars_arrests %>%
    select(arrest_id, dc_number, defendant_pid, arrest_date, incident_date) %>%
    mutate(arrest_year = year(arrest_date)) %>%
    filter(arrest_year > 2009) %>%
    mutate(defendant_pid = trimws(defendant_pid)) %>%
    # can only determine recid for cases that have PID
    filter(!is.na(defendant_pid) & defendant_pid != "" & defendant_pid != 0) %>%
    mutate(dc_pid = paste(dc_number, defendant_pid, sep = ";"),
           pid_arrest_date = paste(defendant_pid, arrest_date, sep = ";")) %>%
    # keep first arrest on dc_pid (rare, but may have 2+ arrests on same dc_pid)
    # We are interested in new "criminal incidents" so makes sense to not count dc_pid twice
    arrange(defendant_pid, arrest_date) %>%
    distinct(dc_pid, .keep_all = TRUE)
  
  # per defendant and arrest date: count of later arrests and the next arrest date
  next_arrest_by_date <- arrests_clean %>%
    # keep distinct arrest dates (rare, but may have 2+ arrests on same day)
    distinct(pid_arrest_date, .keep_all = TRUE) %>%
    # newest first, so row_number() - 1 is the number of future arrests
    arrange(defendant_pid, desc(arrest_date)) %>%
    mutate(number_of_future_arrests = row_number() - 1, .by = defendant_pid) %>%
    # oldest first, so lead() is the next arrest date
    arrange(defendant_pid, arrest_date) %>%
    mutate(rearrest_date = lead(arrest_date), .by = defendant_pid) %>%
    select(pid_arrest_date, number_of_future_arrests, rearrest_date)
  
  # attach those values to every dc_pid
  rearrest_lookup <- arrests_clean %>%
    left_join(., next_arrest_by_date, by = "pid_arrest_date") %>%
    select(dc_pid, pid_arrest_date, number_of_future_arrests, rearrest_date)
  
  # merge rearrest variables back onto main df by dc_pid
  df %>%
    mutate(dc_pid = paste(dc_number, defendant_pid, sep = ";")) %>%
    # no `by` given: joins on the columns the two tables share
    left_join(., rearrest_lookup) %>%
    # get indicator if defendant had a new arrest within two years of case open date
    mutate(rearrest_days_fromOGopen = as.numeric(rearrest_date - case_open_date)) %>%
    # get rearrests. note if time to rearrest is NA it means there was no new arrest
    mutate(rearrest_win_2yr_fromOGopen = case_when(
      is.na(rearrest_days_fromOGopen) ~ 0,
      rearrest_days_fromOGopen <= 730 & rearrest_days_fromOGopen > 0 ~ 1,
      TRUE ~ 0
    ))
}



## Get disposition variables
# Adds conviction indicators. A conviction is a dispo_type of "Guilty" or
# "Guilty Plea/Nolo".
# Row level:
#   lead_charge_grade_sent - most serious grade found on the docket
#                            (F or H -> "F", then "M", "S", "IC"; NA if none)
#   violent_conviction, misd_conviction, fel_conviction
# Docket level (same value on every row of the docket):
#   conviction             - any conviction on the docket
#   any_violent_conviction - convicted on any violent charge
#   any_misd_conviction    - convicted on any misdemeanor charge
#   any_fel_conviction     - convicted on any felony charge
getDispoVars <- function(df){
  
  guilty_dispos <- c("Guilty","Guilty Plea/Nolo")
  
  df %>%
    # see if we can fill in any missing lead charge grade (at sentencing) variables
    mutate(lead_charge_grade_sent = case_when(
             any(grepl("F", lead_charge_grade)) ~ "F",
             any(grepl("H", lead_charge_grade)) ~ "F",
             any(grepl("M", lead_charge_grade)) ~ "M",
             any(grepl("S", lead_charge_grade)) ~ "S",
             any(grepl("IC", lead_charge_grade)) ~ "IC",
             TRUE ~ NA_character_
           ),
           .by = docket_number) %>%
    # note this uses crime category and charge grade at sentencing, not charging
    mutate(violent_conviction = ifelse(crimebigcat == "Violent" & dispo_type %in% guilty_dispos, 1, 0),
           misd_conviction = ifelse(dispo_type %in% guilty_dispos & grepl("M", lead_charge_grade_sent), 1, 0),
           fel_conviction = ifelse(dispo_type %in% guilty_dispos & grepl("F", lead_charge_grade_sent), 1, 0)) %>%
    # make violent conviction, Felony conviction, and misdemeanor conviction case-level variables
    mutate(conviction = ifelse(any(dispo_type %in% guilty_dispos), 1, 0),
           any_violent_conviction = ifelse(any(violent_conviction == 1), 1, 0),
           any_misd_conviction = ifelse(any(misd_conviction == 1), 1, 0),
           any_fel_conviction = ifelse(any(fel_conviction == 1), 1, 0),
           .by = docket_number)
}



## Get Cross-case IDs
## Need this for getPriors() function (need to merge MC and CP cases to see dispositions, etc.)
# Loads the list of linked MC and CP cases, reshapes it to one row per
# (end docket, related docket) pair, resolves related dockets that appear more
# than once, and joins the result onto df by docket_number.
# Adds cross_court_id, end_docket_number and og_docket_source as the first three
# columns of df (NA where the docket has no link).
getCrossCaseIDs <- function(df){
  
  # load linked MC and CP cases dataset
  linked <- get(load("/srv/data/penn/cp_mc_links.RData"))
  
  # long format: each end docket paired with itself and with its originating docket
  linked <- linked %>%
    select(cross_court_id, docket_number, og_docket_source, og_docket_number) %>%
    mutate(end_docket_number = docket_number) %>%
    pivot_longer(.,
                 cols = c(docket_number, og_docket_number),
                 names_to = "name",
                 values_to = "all_docket_numbers") %>%
    distinct(end_docket_number, all_docket_numbers, .keep_all = TRUE) %>%
    # a docket paired with itself is labelled "self"
    mutate(og_docket_source = ifelse(end_docket_number == all_docket_numbers, "self", og_docket_source)) %>%
    select(-name)
  
  # related dockets that show up on more than one row would duplicate rows of
  # df in the join, so set every row for them aside
  dup_keys <- linked$all_docket_numbers[duplicated(linked$all_docket_numbers)]
  ambiguous <- linked %>%
    filter(all_docket_numbers %in% dup_keys)
  unambiguous <- linked %>%
    filter(!all_docket_numbers %in% dup_keys)
  
  # of the ambiguous rows, keep only an "agree" link, and only when it is the
  # single "agree" row for that related docket
  resolved <- ambiguous %>%
    mutate(source_count = n(), .by = c(all_docket_numbers, og_docket_source)) %>%
    filter(og_docket_source == "agree" & source_count == 1) %>%
    select(-source_count)
  
  link_list <- unambiguous %>%
    rbind(., resolved) %>%
    # drop if end_docket_number is not in df$docket_number (these are cases dropped in getSharedCases())
    filter(end_docket_number %in% df$docket_number)
  
  # join cleaned CP MC merge info onto main dataset
  df %>%
    left_join(., link_list, by = c("docket_number" = "all_docket_numbers")) %>%
    # bring new variables to the front
    select(cross_court_id, end_docket_number, og_docket_source, everything())
}

## Get Final Case Disposition
# Gets final disposition type and date for CP cases (they will have multiple dispositions, 
# one for each of their originating MC cases)
# df: needs cross_court_id, end_docket_number, docket_number, dispo_type and
#     disposition_date (i.e. run after getCrossCaseIDs()).
# Adds:
#   end_dispo_type       - disposition type taken from the end docket
#   end_disposition_date - latest disposition_date among the dockets sharing a
#                          cross_court_id; for dockets with no cross_court_id it
#                          is computed within the docket itself. NA if any
#                          disposition_date in the group is missing.
getFinalDispo <- function(df){
  
  # pull out dispositions by case and summarize 
  dispos <- df %>%
    select(cross_court_id, end_docket_number, docket_number, dispo_type) %>%
    distinct(end_docket_number, docket_number, dispo_type, .keep_all = TRUE)
  
  # for cases that end in CP, only the CP docket's own disposition counts
  cp_dispos <- dispos %>%
    filter(grepl("CP51CR",end_docket_number) & grepl("CP51CR",docket_number))
  
  mc_dispos <- dispos %>%
    filter(grepl("MC51CR",end_docket_number))
  
  dispos <- rbind(cp_dispos, mc_dispos) %>%
    # rename cross-court-ID level disposition variables 
    rename(end_dispo_type = dispo_type) %>%
    select(-docket_number)
  
  # merge back to original df
  df <- df %>%
    # no `by` given: joins on the columns the two tables share
    left_join(., dispos) %>%
    # get final disposition date by cross-court ID.
    # Dockets without a cross_court_id must not be grouped together: grouping on
    # cross_court_id directly puts every NA in one group, so all unlinked dockets
    # would share a single max date. Use a temporary key that falls back to the
    # docket number for them.
    mutate(dispo_group_tmp = ifelse(is.na(cross_court_id),
                                    paste0("unlinked;", docket_number),
                                    paste0("linked;", cross_court_id))) %>%
    mutate(end_disposition_date = max(disposition_date), .by = dispo_group_tmp) %>%
    select(-dispo_group_tmp)
  
  return(df)
}


## Get Priors
# Gets indicators for whether defendant has any prior case, any priors in the past year, 
# prior misdemeanor conviction, prior felony conviction, prior violent conviction, and whether 
# the defendant had a pending case at the time of current case opening.
# df: dataframe of all cases
# Note we are using MC cases to calculate this so we're looking at priors from the previous case 
# open date in MC (not CP). So when we merge back, need to expand variables to the related CP cases.
# Adds to df (all 0/1 except the first):
#   prior_timediff_days      - days between this case's open date and the defendant's
#                              previous distinct case open date
#   prior_past_yr            - previous case opened 1 to 365 days before this one
#   any_priors               - defendant has an earlier case open date
#   pending_case_at_open     - the immediately preceding case had an end disposition
#                              date after this case opened
#   prior_misd_conviction, prior_fel_conviction, prior_violent_conviction,
#   prior_incarceration      - set once any earlier case had that outcome
# Relies on helpers defined elsewhere in the file / project: cleanStatutes(),
# get_nibrs_ucr_cats(), getCrimeCat(), getCrossCaseIDs(), getFinalDispo(),
# getSentencesDAOCMS().
getPriors <- function(df){
  
  # full case history with crime category, final disposition and conviction flags
  cases <- load_cases() %>%
    mutate(docket_year = str_extract(docket_number, "[0-9]{4}$")) %>%
    # unreliable before 2010, but cut off at 2008 to compare w/ arrests- PARS arrests start in 2008
    # (docket_year is a character string here, so this is a string comparison)
    filter(docket_year >= 2008) %>% 
    filter(!is.na(defendant_pid)) %>%  # can only determine priors for cases that have PID
    distinct(docket_number, .keep_all = T) %>% 
    mutate(defendant_pid = trimws(defendant_pid)) %>%
    filter(defendant_pid != "" & defendant_pid != 0) %>%
    # Get NIBRS / UCR offense information
    cleanStatutes(.) %>%
    get_nibrs_ucr_cats(as.data.frame(.),
                       title_var = statute_title,
                       section_var = statute_section,
                       subsection_var = statute_subsection,
                       desc_var = lead_charge_description,
                       add_flag = FALSE) %>%
    # get crime categories (at disposition) - need for violent indicators
    getCrimeCat(.) %>%
    # get end disposition date, end docket number (need to look at CP cases to see convictions)
    getCrossCaseIDs(.) %>%
    # fill in cases that did not match to an end docket number (just using this for priors)
    mutate(end_docket_number = ifelse(is.na(end_docket_number), docket_number, end_docket_number)) %>%
    getFinalDispo(.) %>%
    # get indicators for violent, misdemeanor, felony convictions
    # see if we can fill in any missing lead charge grade (at sentencing) variables
    # (most serious grade found across the dockets sharing an end docket number)
    mutate(lead_charge_grade_sent = case_when(any(grepl("F",lead_charge_grade)) ~ "F",
                                               any(grepl("H",lead_charge_grade)) ~ "F",
                                               any(grepl("M",lead_charge_grade)) ~ "M",
                                               any(grepl("S",lead_charge_grade)) ~ "S",
                                               any(grepl("IC",lead_charge_grade)) ~ "IC",
                                               TRUE ~ NA_character_),
            .by = end_docket_number) %>%
    # note this uses crime category and charge grade at sentencing, not charging
    mutate(violent_conviction = ifelse(crimebigcat == "Violent" & dispo_type %in% c("Guilty","Guilty Plea/Nolo"), 1, 0)) %>%
    mutate(misd_conviction = ifelse(dispo_type %in% c("Guilty","Guilty Plea/Nolo") & grepl("M", lead_charge_grade_sent), 1, 0)) %>%
    mutate(fel_conviction = ifelse(dispo_type %in% c("Guilty","Guilty Plea/Nolo") & grepl("F", lead_charge_grade_sent), 1, 0)) %>%
    # make violent conviction, Felony conviction, and misdemeanor conviction case-level variables
    mutate(any_violent_conviction = ifelse(any(violent_conviction == 1), 1, 0),
           any_misd_conviction = ifelse(any(misd_conviction == 1), 1, 0),
           any_fel_conviction = ifelse(any(fel_conviction == 1), 1, 0),
           .by = end_docket_number) %>%
    # get indicator for incarceration sentence
    getSentencesDAOCMS(.) %>%
    rename(incarceration_sentence = any_incarceration)
  
  # one row per defendant and OTN (earliest case open date), MC cases only
  priors <- cases %>%
    filter(startsWith(docket_number, "MC51CR")) %>%  # keep just MC cases, all cases start in MC
    select(docket_number,case_open_date,end_disposition_date,defendant_pid,otn,
            dc_number,crimebigcat,dispo_type,any_violent_conviction,any_misd_conviction, 
            any_fel_conviction,incarceration_sentence) %>%
    # Distinct by otn because may have several cases opened from same offense, and we are interested in 
    # measuring criminal Hx (if we keep multiple cases on same offense only one may get counted as the prior, etc.)
    # Go by the earliest case open date for a given OTN to calculate priors for that offense
    arrange(defendant_pid, case_open_date) %>%
    distinct(defendant_pid, otn, .keep_all = T) 
  
  # save list of defendant PIDs, OTNs, and case open dates to merge back by
  temp <- priors %>%
    distinct(defendant_pid, case_open_date, otn)
  
  # one row per defendant and case open date, with every prior-history variable
  priors_final <- priors %>%
    # get distinct case open dates (may be multiple cases opened on the same day for same OTN)
    distinct(defendant_pid, case_open_date, .keep_all = T) %>%
    select(-c(docket_number, dc_number)) %>%
    arrange(defendant_pid, case_open_date) %>%
    # days since the defendant's previous case open date (NA for their first case)
    mutate(timediff_days = as.numeric(difftime(case_open_date, dplyr::lag(case_open_date), units = "days")), .by = defendant_pid) %>%
    # don't count it as a prior if multiple cases opened on same day (time diff must be > 0)
    mutate(prior_past_yr = case_when(timediff_days <= 365 & timediff_days > 0 ~ 1,
                                     is.na(timediff_days) ~ 0,
                                     TRUE ~ 0)) %>%
    # also get indicator for ANY priors. first set all "any prior" flags to 1 by defendant, then correct the first open case to 0
    mutate(any_priors = ifelse(any(!is.na(timediff_days)), 1, 0), .by = defendant_pid) %>%
    mutate(any_priors = ifelse(any_priors == 1 & is.na(timediff_days), 0, any_priors)) %>%
    # also want indicator for whether the defendant had a pending case at time of current case opening
    # (only the immediately preceding case is checked)
    mutate(pending_case_at_open = ifelse(lag(case_open_date) != case_open_date &
                                           lag(end_disposition_date) > case_open_date, 1, 0),
           .by = defendant_pid) %>%
    mutate(pending_case_at_open = ifelse(is.na(pending_case_at_open), 0, pending_case_at_open)) %>%
    # indicators for prior convictions
    # (1 when the preceding case had the outcome, NA otherwise; the fill() calls
    # below then carry a 1 forward to all of the defendant's later cases)
    mutate(prior_misd_conviction = ifelse(lag(any_misd_conviction) == 1, 1, NA_integer_), .by = defendant_pid) %>%
    mutate(prior_fel_conviction = ifelse(lag(any_fel_conviction) == 1, 1, NA_integer_), .by = defendant_pid) %>%
    mutate(prior_violent_conviction = ifelse(lag(any_violent_conviction) == 1, 1, NA_integer_), .by = defendant_pid) %>%
    mutate(prior_incarceration = ifelse(lag(incarceration_sentence) == 1, 1, NA_integer_), .by = defendant_pid) %>%
    # update: fill() no longer supports .by option, need to group and ungroup df
    group_by(defendant_pid) %>%
    fill(prior_misd_conviction, .direction = "down") %>%
    fill(prior_fel_conviction, .direction = "down") %>%
    fill(prior_violent_conviction, .direction = "down") %>%
    fill(prior_incarceration, .direction = "down") %>%
    ungroup() %>%
    # whatever is still NA had no earlier case with that outcome
    mutate(prior_misd_conviction = ifelse(is.na(prior_misd_conviction), 0, prior_misd_conviction)) %>%
    mutate(prior_fel_conviction = ifelse(is.na(prior_fel_conviction), 0, prior_fel_conviction)) %>%
    mutate(prior_violent_conviction = ifelse(is.na(prior_violent_conviction), 0, prior_violent_conviction)) %>%
    mutate(prior_incarceration = ifelse(is.na(prior_incarceration), 0, prior_incarceration)) %>%
    # keep just the variables we need (will merge on PID and case open date)
    select(case_open_date, defendant_pid, timediff_days, prior_past_yr, any_priors, pending_case_at_open,
           prior_misd_conviction, prior_fel_conviction, prior_violent_conviction, prior_incarceration) %>%
    rename(prior_timediff_days = timediff_days)
  
  # merge OTN back in-- applies priors outcomes to all cases opened on the same day for that defendant
  priors_final <- left_join(priors_final, temp, by = c("case_open_date", "defendant_pid")) %>%
    select(-case_open_date)
  
  # Merge back to main dataset by defendant and OTN, then fill in info for matched CP cases
  df <- left_join(df, priors_final, by = c("otn", "defendant_pid")) %>%
    # within a docket, copy the case-specific variables to rows that did not match
    group_by(docket_number) %>%
    fill(prior_past_yr, .direction = "downup") %>%
    fill(pending_case_at_open, .direction = "downup") %>%
    ungroup() %>%
    # within a defendant, carry the cumulative history variables forward in time
    arrange(defendant_pid, case_open_date) %>%
    group_by(defendant_pid) %>%
    fill(any_priors, .direction = "down") %>%
    fill(prior_misd_conviction, .direction = "down") %>%
    fill(prior_fel_conviction, .direction = "down") %>%
    fill(prior_violent_conviction, .direction = "down") %>%
    fill(prior_incarceration, .direction = "down") %>%
    ungroup()
  
  # any remaining NAs in priors variables are 0 because either the defendant only had
  # one case, or it was their first case. So fill in these zeros
  # (prior_timediff_days is not filled and stays NA for those rows)
  df <- df %>%
    mutate(prior_past_yr = ifelse(is.na(prior_past_yr), 0, prior_past_yr)) %>%
    mutate(any_priors = ifelse(is.na(any_priors), 0, any_priors)) %>%
    mutate(pending_case_at_open = ifelse(is.na(pending_case_at_open), 0, pending_case_at_open)) %>%
    mutate(prior_misd_conviction = ifelse(is.na(prior_misd_conviction), 0, prior_misd_conviction)) %>%
    mutate(prior_fel_conviction = ifelse(is.na(prior_fel_conviction), 0, prior_fel_conviction)) %>%
    mutate(prior_violent_conviction = ifelse(is.na(prior_violent_conviction), 0, prior_violent_conviction)) %>%
    mutate(prior_incarceration = ifelse(is.na(prior_incarceration), 0, prior_incarceration)) 
  
  return(df)
}


## Get Prior Arrests
# Gets variable indicating if defendant had an arrest in the past year (for a 
# separate offense / incident). Also gets a version that indicates any prior arrest
getPriorArrest <- function(df){
  # Adds prior-arrest history to `df`, matched on DC number + defendant PID.
  # Columns added: dc_pid, pid_arrest_date, number_of_prior_arrests,
  # priorarrest_date, priorarrest_days_fromOGopen, priorarrest_past_year and
  # any_prior_arrest. Returns `df` with those columns appended.
  
  # NOTE(review): presumably load_arrests() makes `pars_arrests` visible from
  # here (e.g. assigns it in the global environment) -- confirm
  load_arrests()
  
  pars_arrests <- pars_arrests %>%
    select(arrest_id, dc_number, defendant_pid, arrest_date, incident_date) %>%
    mutate(arrest_year = year(arrest_date)) %>%
    # can only determine prior arrests for records that have a PID
    mutate(defendant_pid = trimws(defendant_pid)) %>%
    filter(!is.na(defendant_pid)) %>%  
    filter(defendant_pid != "" & defendant_pid != 0) %>%
    mutate(dc_pid = paste(dc_number, defendant_pid, sep = ";")) %>%
    mutate(pid_arrest_date = paste(defendant_pid, arrest_date, sep = ";")) %>%
    # keep first arrest on dc_pid (rare, but may have 2+ arrests on same dc_pid)
    # We are interested in new "criminal incidents" so makes sense to not count dc_pid twice
    arrange(defendant_pid, arrest_date) %>%
    distinct(dc_pid, .keep_all = TRUE)
  
  priorarrest <- pars_arrests %>%
    # keep distinct arrest dates (rare, but may have 2+ arrests on same day)
    distinct(pid_arrest_date, .keep_all = TRUE) %>%
    arrange(defendant_pid, arrest_date) %>%
    # number of earlier distinct arrest dates for this defendant
    mutate(number_of_prior_arrests = row_number() - 1, .by = defendant_pid) %>%
    # date of the immediately preceding arrest (NA for the first arrest)
    mutate(priorarrest_date = dplyr::lag(arrest_date), .by = defendant_pid) %>%
    select(pid_arrest_date, number_of_prior_arrests, priorarrest_date)
  
  # spread the per-arrest-date history back onto every dc_pid
  priorarrest <- left_join(pars_arrests, priorarrest, by = "pid_arrest_date") %>%
    select(dc_pid, pid_arrest_date, number_of_prior_arrests, priorarrest_date)
  
  # merge prior-arrest variables back onto main df by dc_pid
  # NOTE(review): defendant_pid is trimmed on the arrest side but not on `df`
  # here -- confirm the keys line up
  df <- df %>%
    mutate(dc_pid = paste(dc_number, defendant_pid, sep = ";")) %>%
    # explicit key: a natural join would silently key on any other shared column
    left_join(., priorarrest, by = "dc_pid") %>%
    # days between the previous arrest and this case's open date
    mutate(priorarrest_days_fromOGopen = as.numeric(case_open_date - priorarrest_date)) %>%
    # indicator for a prior arrest within one year of case open date.
    # note if the day count is NA it means there was no prior arrest
    mutate(priorarrest_past_year = case_when(is.na(priorarrest_days_fromOGopen) ~ 0,
                                               priorarrest_days_fromOGopen <= 365 &
                                                 priorarrest_days_fromOGopen > 0 ~ 1,
                                               TRUE ~ 0)) %>%
    # row-level indicator: previously this used any() over all of a defendant's
    # cases, which flagged a defendant's first case as having a prior arrest
    # whenever a later case existed
    mutate(any_prior_arrest = ifelse(!is.na(priorarrest_days_fromOGopen), 1, 0))
  
  return(df)
}



# Get ADA and Judge Identifiers-------------------------------------------------

# note that this section creates a hearing-ADA level dataset (e.g., could be 2+ ADAs 
# per hearing, and each will get their own row).


## Get ADA Identifiers Dataset
# Loads and merges master clean dataset and static AOPC dataset 
# Cleans into long dataset of ADA names to merge onto our main build
getADAIdentifiers <- function(){
  # Builds a long table with one row per (docket_day, ADA name).
  # NOTE(review): the two load() calls are expected to create `master_clean`
  # and `static_ada_w_id` in this function's environment -- confirm the
  # object names stored in the .rda files.
  load('/srv/data/penn/cjactor_names/matching_final/master_clean_wide.rda')
  load('/srv/data/penn/static_ada_with_id_Dec21.rda')
  
  # Merge to master_clean by ID (and docket number)
  merged <- left_join(static_ada_w_id, master_clean, by = c("id", "docket_number"))
  
  wide_adas <- merged %>%
    # docket-day ID used to merge with the main build (time stamp stripped)
    mutate(docket_day = gsub(" [0-9]{2}:[0-9]{2}:[0-9]{2}", "",
                             paste(docket_number, filed_date))) %>%
    # drop every column that is not needed downstream
    select(-c(docket_entry_comments.x, docket_entry_comments.y, is_sealed,
              row_n_atty, atty_1, atty_2, atty_3, atty_4, atty_5, atty_6,
              atty_intern, atty_clean, docket_year, id, judge, judge_clean,
              ada_clean, filed_date, official_docket_entry, docket_number,
              row_n_ada)) %>%
    # keep rows where at least one ADA slot is filled, then de-duplicate
    filter(!is.na(ada_1) | !is.na(ada_2) | !is.na(ada_3) |
             !is.na(ada_4) | !is.na(ada_5) | !is.na(ada_6)) %>%
    distinct(.keep_all = TRUE)
  
  # Long dataset with rows by ADA-hearing (distinct ADAs per hearing)
  wide_adas %>%
    pivot_longer(cols = ada_1:ada_6,
                 names_to = "ada_number",
                 values_to = "ada_name") %>%
    filter(!is.na(ada_name)) %>%
    distinct(docket_day, ada_name, .keep_all = TRUE)
}


## Merge ADA Info
# merges long dataset with cleaned ADA names onto main build
# Note: do this AFTER collapsing docket entry comments, etc.
mergeADAs <- function(df, adas_long){
  # Attaches cleaned ADA names to `df`.
  #   df:        main build; must contain docket_number and listing_date
  #   adas_long: long ADA table keyed by docket_day
  # Returns `df` with docket_day plus the columns of `adas_long`; ada_name is
  # converted to a factor. A hearing gains one row per matching ADA.
  
  # key = docket number + listing date, pasted with a space
  keyed <- mutate(df, docket_day = paste(docket_number, listing_date))
  
  # join with cleaned ADA names
  with_adas <- left_join(keyed, adas_long, by = "docket_day")
  
  mutate(with_adas, ada_name = as.factor(ada_name))
}



## Clean ADA Dockets
# cleans up dataset by removing empty docket entries (there are so many listings
# with "NA" docket entries, I don't think it makes sense to count that as a hearing). 
# Also creates indicator for when ADA is identified at a hearing.
cleanADAdockets <- function(df){
  # Strips leading "NA" / "NA |" filler from the collapsed docket entry text,
  # flags rows with an identified ADA (ada_identified = 1/0), and drops rows
  # whose remaining docket entry text is missing, "NA" or empty.
  
  df %>%
    # remove any run of leading "NA" placeholders (optionally "|"-separated)
    mutate(docket_entry_comments_collapse = gsub("^NA ?\\|?( ?NA ?\\|?)*", "",
                                                 docket_entry_comments_collapse)) %>%
    mutate(ada_identified = ifelse(is.na(ada_name), 0, 1)) %>%
    mutate(docket_entry_comments_collapse = str_trim(docket_entry_comments_collapse)) %>%
    # keep only listings that still have real docket entry text
    filter(!is.na(docket_entry_comments_collapse)) %>%
    filter(docket_entry_comments_collapse != "NA") %>%
    filter(docket_entry_comments_collapse != "")
}


## Create Case Length variables
# Note: this is intended to be run after removing empty listings. Accounts for 
# hearing-by-ADA level dataset and returns listing number based on unique listings.
# NOTE: this uses disposition_date_update, which sets the diversion entry date as the
# disposition date for those cases that got diverted out of MC
getCaseLength <- function(df){
  # Adds case-length measures to `df`:
  #   case_length_days / case_length_days_update - days from case open to
  #     disposition_date / disposition_date_update
  #   listing_number  - running index of distinct listings (id_dlc) in a docket
  #   case_length_nol - number of distinct listings in the docket
  # The incoming listing_number column is dropped and rebuilt.
  
  with_days <- df %>%
    mutate(case_length_days = as.numeric(difftime(disposition_date, case_open_date),
                                         units = "days"),
           case_length_days_update = as.numeric(difftime(disposition_date_update, case_open_date),
                                                units = "days")) %>%
    # remove old listing number variable; the updated one is merged in below
    select(-listing_number)
  
  # one row per distinct hearing (ADA rows collapsed), numbered within docket
  # NOTE(review): numbering follows the current row order of `df` -- confirm
  # rows arrive sorted by listing date within docket
  listing_index <- with_days %>%
    distinct(id_dlc, .keep_all = TRUE) %>%
    select(docket_number, listing_date, id_dlc) %>%
    mutate(listing_number = row_number(), .by = docket_number) %>%
    mutate(case_length_nol = max(listing_number), .by = docket_number)
  
  # merge new listing numbers back onto main dataset (joins on shared columns)
  left_join(with_days, listing_index)
}



# helper function to load and rename RDA files 
load_object <- function(file) {
  # Loads an .rda/.RData file into a private environment and returns the
  # object it contains, so the caller chooses the name it is bound to.
  #   file: path passed to load()
  # Returns the stored object. If the file holds several objects, the
  # alphabetically first one is returned and a warning is raised; a file
  # with no (visible) objects is an error.
  tmp <- new.env()
  load(file = file, envir = tmp)
  obj_names <- ls(tmp)
  
  if (length(obj_names) == 0) {
    stop("No objects found in '", file, "'", call. = FALSE)
  }
  if (length(obj_names) > 1) {
    warning("'", file, "' contains ", length(obj_names), " objects (",
            paste(obj_names, collapse = ", "), "); returning '",
            obj_names[1], "'", call. = FALSE)
  }
  
  tmp[[obj_names[1]]]
}

## Get Judge Identifiers 
# Loads and merges master clean dataset and static AOPC dataset 
# df: data frame of hearings to get judge names for. Must contain docket_day variable
getJudges <- function(df){
  # Attaches judge names to `df` by docket_day and converts `judge` to a factor.
  # A hearing with several distinct judges on record gains one row per judge.
  
  master_clean <- load_object('/srv/data/penn/cjactor_names/matching_final/master_clean_wide.rda')
  static_ada_w_id <- load_object('/srv/data/penn/static_ada_with_id_Dec21.rda')
  
  # Merge to master_clean by ID (and docket number)
  merged <- left_join(static_ada_w_id, master_clean, by = c("id", "docket_number"))
  
  judge_dockets <- merged %>%
    select(docket_number, filed_date, docket_entry_comments.x, judge) %>%
    # docket-day ID used to merge with the main build (time stamp stripped)
    mutate(docket_day = gsub(" [0-9]{2}:[0-9]{2}:[0-9]{2}", "",
                             paste(docket_number, filed_date))) %>%
    select(judge, docket_day) %>%
    # keep each non-missing (judge, docket_day) pair once
    filter(!is.na(judge)) %>%
    distinct(.keep_all = TRUE)
  
  with_judges <- left_join(df, judge_dockets, by = "docket_day")
  
  mutate(with_judges, judge = as.factor(judge))
}


# Get Indicators / misc. Variables for Analyses---------------------------------


## Get Charging variables
# gets extra charging variables for PS estimation, including current 
# number of charges, conspiracy, pic, vufa, and resisting arrest indicators
getChargingVars <- function(df){
  # Adds charging-stage variables to `df`, matched on dc_number + defendant_pid:
  # has_pic_dao, has_conspiracy_dao, has_vufa_dao, has_resisting_dao and
  # number_of_charges. Also trims dc_number / defendant_pid on `df` and
  # leaves a row identifier `temp_id` in the output.
  
  # number of non-declined charges per arrest (the charges DAO actually charged)
  charge_counts <- get_complaint_charge_df() %>%
    filter(is_charge_declined == FALSE) %>%
    mutate(number_of_charges = n(), .by = arrest_id) %>%
    distinct(arrest_id, number_of_charges)
  
  benchmark <- readRDS("/srv/data/penn/benchmark/updown_charging_benchmark_data_v4.rds")
  
  charging_df <- benchmark %>%
    tidytable::select(arrest_id, dc_number, defendant_pid, has_pic_dao,
                      has_conspiracy_dao, has_vufa_dao, has_resisting_dao) %>%
    # joins on the columns the two tables share
    left_join(charge_counts) %>%
    # get distinct dc-pids to merge
    distinct(.keep_all = TRUE) %>%
    mutate(defendant_pid = str_trim(defendant_pid),
           dc_number = str_trim(dc_number)) %>%
    filter(!is.na(defendant_pid)) %>%
    filter(!is.na(dc_number)) %>%
    filter(!is.na(has_pic_dao))
  
  # join with main dataset by dc_number and defendant_pid
  df %>%
    mutate(defendant_pid = str_trim(defendant_pid),
           dc_number = str_trim(dc_number)) %>%
    # row id so that join fan-out can be collapsed again below
    mutate(temp_id = row_number()) %>%
    left_join(charging_df, by = c("dc_number", "defendant_pid")) %>%
    # there are ~200 instances where a dc-pid has two arrest IDs, but the
    # joined info (has pic, num charges, etc.) looks the same whichever
    # arrest ID is taken. So drop arrest ID and keep the first row per temp_id
    tidytable::select(-arrest_id) %>%
    distinct(temp_id, .keep_all = TRUE)
}


## Get Pretrial Release Status
# Gets an indicator for whether the defendant was released from jail within 3 days
getPretrialReleaseStatus <- function(df){
  # Adds bail_posted_in_3days to `df`: 1 if any bail record for the docket has
  # status "Posted" with (bail_status_date - arrest_date) under 4 days, else 0.
  # Dockets with no bail records keep NA.
  
  # `df` has many rows per docket; reduce it to the distinct
  # (docket_number, arrest_date) pairs before joining so the bail records are
  # not multiplied by every hearing row. Because the indicator is an any()
  # over the docket, the result is unchanged.
  docket_arrest_dates <- df %>%
    select(docket_number, arrest_date) %>%
    distinct()
  
  philly_bail <- arrow::open_dataset(daocore::get_data_path("aopc/cache/combined/bail/philadelphia_bail_data.parquet")) %>%
    collect() %>%
    filter(docket_number %in% df$docket_number) %>%
    arrange(docket_number, bail_status_date) %>%
    select(docket_number, bail_status_date, bail_status, bail_type) %>%
    left_join(., docket_arrest_dates, by = "docket_number") %>%
    mutate(date_diff = difftime(bail_status_date, arrest_date, units = "days")) %>%
    mutate(bail_posted_in_3days = ifelse(any(bail_status == "Posted" & date_diff < 4), 1, 0), .by = docket_number) %>%
    # collapse to one row per docket
    distinct(docket_number, .keep_all = TRUE) %>%
    select(docket_number, bail_posted_in_3days)
  
  df <- df %>%
    left_join(., philly_bail, by = "docket_number")
  
  return(df)
}


## Get DAOCMS Sentences
# function to pull corrected sentences from DAOCMS
getSentencesDAOCMS <- function(df){
  # Adds docket-level sentence and disposition indicators to `df`:
  #   any_probation, any_incarceration, probation_only, punished (from DAOCMS)
  #   conviction, convicted_no_sentence, acquitted (from dispo_type)
  
  docket_sentences <- load_case_listings_with_sentences() %>%
    select(docket_number, has_probation, has_incarceration) %>%
    # 1 if any listing on the docket carries the sentence type
    mutate(any_probation = ifelse(any(has_probation), 1, 0),
           any_incarceration = ifelse(any(has_incarceration), 1, 0),
           .by = docket_number) %>%
    distinct(docket_number, .keep_all = TRUE) %>%
    select(docket_number, any_probation, any_incarceration) %>%
    # an undetermined (NA) flag is treated as no sentence of that type
    mutate(any_probation = ifelse(is.na(any_probation), 0, any_probation),
           any_incarceration = ifelse(is.na(any_incarceration), 0, any_incarceration)) %>%
    mutate(probation_only = ifelse(any_probation == 1 & any_incarceration == 0, 1, 0),
           punished = ifelse(any_probation == 1 | any_incarceration == 1, 1, 0))
  
  # joins on the columns shared with `df`
  with_sentences <- left_join(df, docket_sentences)
  
  with_sentences %>%
    mutate(conviction = ifelse(any(dispo_type %in% c("Guilty","Guilty Plea/Nolo")), 1, 0),
           .by = docket_number) %>%
    # NOTE(review): `punished` is NA for dockets absent from the sentence
    # data, so convicted dockets without a sentence record get NA here
    mutate(convicted_no_sentence = ifelse(conviction == 1 & punished == 0, 1, 0)) %>%
    mutate(acquitted = ifelse(any(dispo_type == "Not Guilty/Acquittal"), 1, 0),
           .by = docket_number)
}


## Get Current Probation
# Pulls probation sentence info from DAOCMS. Then gets indicator for whether defendant is 
# already on probation at the time current case is opened.
# Logic is: by defendant, get max probation end date for all cases opened prior to the
# current case open date, then see if current case open date is before prior probation end date
getCurrentProbation <- function(df){
  # Adds to `df`:
  #   prior_probation_end_date - latest probation end date among the
  #     defendant's earlier incidents (dc_pid), NA if none
  #   on_probation - 1 if case_open_date <= prior_probation_end_date, else 0
  # Also adds the join key dc_pid.
  
  # load case listings with sentences to get probation info
  case_listings_with_sentences <- load_case_listings_with_sentences() %>%
    mutate(dc_pid = paste0(dc_number,";",defendant_pid)) %>%
    mutate(any_probation = ifelse(any(has_probation) == TRUE, 1, 0), .by = dc_pid) %>%
    # NOTE(review): max(..., na.rm = T) returns -Inf (with a warning) when every
    # probation_max in the group is NA
    mutate(unit_probation = max(probation_max, na.rm = T), .by = dc_pid) %>%
    select(dc_pid, any_probation, unit_probation) %>%
    distinct(dc_pid, .keep_all = T)
  
  # link to cases by dc-pid
  cases <- load_cases() %>%
    mutate(dc_pid = paste0(dc_number,";",defendant_pid)) %>%
    select(docket_number, dc_pid, defendant_pid, otn, dc_number, case_open_date, disposition_date) %>%
    left_join(., case_listings_with_sentences, by = "dc_pid") %>%
    # NOTE(review): the *31 presumably converts probation_max from months to
    # days -- confirm the unit of probation_max
    mutate(probation_days = ifelse(any_probation == 1, unit_probation*31, NA)) %>%
    mutate(probation_end = disposition_date + probation_days) %>%
    # keep max probation end date by dc-pid (because MC case ends before CP case, we want date from CP)
    # (max() without na.rm: any NA probation_end in the group makes the max NA)
    mutate(probation_end_max = max(probation_end), .by = dc_pid) %>%
    # keep just MC case if the case was transferred (do this by keeping first case open date per dc-pid)
    arrange(defendant_pid, case_open_date) %>%
    distinct(dc_pid, .keep_all = TRUE) %>%
    # get marker if defendant was on probation when current case was opened
    select(docket_number, dc_pid, defendant_pid, case_open_date, probation_days, probation_end_max) %>%
    # identify first instance of probation
    mutate(row_num = row_number(), .by = defendant_pid) %>%
    # first_probation equals row_num on every row that has a probation end date
    # (0 otherwise), so the flag below is set on each probation row, not only
    # the first; the downward fill then marks everything from the first onward
    mutate(first_probation = ifelse(!is.na(probation_end_max), row_num, 0),
            .by = defendant_pid) %>%
    mutate(after_first_probation = ifelse(row_num == first_probation, 1, NA)) %>%
    # group by PID (fill() no longer supports .by option)
    group_by(defendant_pid) %>%
    fill(after_first_probation, .direction = "down") %>%
    ungroup() %>%
    # keep just obs after first probation to calculate future probation dates
    # (rows before the first probation have NA here and are dropped)
    filter(after_first_probation == 1) %>%
    # fill in probation dates before getting cumulative max end date
    group_by(defendant_pid) %>%
    fill(probation_end_max, .direction = "down") %>%
    # cummax doesn't work with dates, so switch to integer and back again w/ as.Date 1970 reference date
    mutate(max_end_date = as.Date(cummax(as.integer(probation_end_max)), "1970-01-01")) %>%
    # finally, get prior probation end date by taking lag (don't include current case)
    mutate(prior_probation_end_date = lag(max_end_date)) %>%
    ungroup() %>%
    select(dc_pid, prior_probation_end_date)
  
  # merge prior probation variable to main dataset
  df <- df %>%
    mutate(dc_pid = paste(dc_number, defendant_pid, sep = ";")) %>%
    left_join(., cases, by = "dc_pid") %>%
    # cases with no earlier probation (NA end date) fall through to 0
    mutate(on_probation = case_when(case_open_date <= prior_probation_end_date ~ 1,
                                      is.na(prior_probation_end_date) ~ 0, 
                                      TRUE ~ 0))
  
  return(df)
}

## Get Shift Variables
# creates time / shift variables
getShifts <- function(df){
  # Adds calendar / shift variables derived from the listing:
  # month, week, day (day of week) and year as factors, court_room_id as a
  # factor, and listing_hour (listing_time with a trailing ":NN" set to ":00").
  
  with_calendar <- df %>%
    mutate(month = as.factor(month(listing_date)),
           week = as.factor(week(listing_date)),
           # NOTE: wday = day of week.
           day = as.factor(wday(listing_date)),
           year = as.factor(listing_year),
           court_room_id = as.factor(court_room_id))
  
  # the pattern is anchored at the end, so at most one replacement happens
  mutate(with_calendar,
         listing_hour = sub(":[0-9][0-9]$", ":00", listing_time))
}

## Get Offense Indicators
# creates binary crime type variables and a less granular crime category variable
# also makes other crime type variables into factors for ease of use in analyses
getOffenseIndicators <- function(df){
  # Adds 1/0 offense-category indicators based on crimebigcat_charging
  # (drug, public_order, dui, violent, property, other), then converts
  # crimebigcat_charging, the lead-charge code/grade columns and is_dv to
  # factors for use in analyses.
  
  with_flags <- df %>%
    mutate(
      drug         = ifelse(crimebigcat_charging == "Drugs", 1, 0),
      public_order = ifelse(crimebigcat_charging == "Public Order", 1, 0),
      dui          = ifelse(crimebigcat_charging == "DUI", 1, 0),
      violent      = ifelse(crimebigcat_charging == "Violent", 1, 0),
      property     = ifelse(crimebigcat_charging == "Property", 1, 0),
      other        = ifelse(crimebigcat_charging == "Other", 1, 0)
    )
  
  with_flags %>%
    # fixed level order, with "Other" as the first level
    mutate(crimebigcat_charging = factor(crimebigcat_charging,
                                         levels = c("Other","Violent","Property","Drugs","DUI","Public Order"))) %>%
    mutate(
      lead_charge_offense_code_with_subsection = as.factor(lead_charge_offense_code_with_subsection),
      lead_charge_grade_atCU_levels = as.factor(lead_charge_grade_atCU_levels),
      is_dv = as.factor(is_dv)
    )
}

  
## Get Defendant Indicators
# creates binary defendant demographic variables 
getDefendantIndicators <- function(df){
  # Adds 1/0 defendant demographic indicators:
  #   male  - def_gendercat equals "Male" (NA when def_gendercat is NA)
  #   black - defendant_race_PARS contains "Black"
  #   hisp  - defendant_race_PARS contains "Latino"
  df %>%
    mutate(
      male  = ifelse(def_gendercat == "Male", 1, 0),
      black = ifelse(grepl("Black", defendant_race_PARS), 1, 0),
      hisp  = ifelse(grepl("Latino", defendant_race_PARS), 1, 0)
    )
}


## Get Hearing outcomes
# gets indicators for the following hearing-level outcomes:
# return FTA, marked MBT, disco marked closed, disco incomplete
getHearingOutcomes <- function(df){
  # Adds hearing-level outcome indicators to `df`:
  #   cw_fta_athearing    - any Commonwealth-side FTA (victim/officer/witness/lpo)
  #   return_fta          - Commonwealth FTA at the docket's NEXT hearing
  #   marked_mbt          - kept only on the first hearing of each run of
  #                         consecutive MBT-marked hearings
  #   disco_marked_closed - docket text says discovery is complete/closed
  #   disco_incomplete    - docket text / event reason says discovery incomplete
  #   cw_notready_combo   - cw_notready, Commonwealth FTA or incomplete discovery
  
  df <- df %>%
    # Commonwealth FTA (FTAs that ADAs are responsible for)
    mutate(cw_fta_athearing = ifelse((victim_fta==1|officer_fta==1|witness_fta==1|lpo_fta==1),1,0))
  
  # hearing-level table (one row per docket_day) so lead/lag step across
  # hearings rather than across ADA rows
  temp <- df %>%
    distinct(docket_day, .keep_all = TRUE) %>%
    arrange(docket_number, listing_date) %>%
    #(if no lead, return FTA will be NA-- makes sense since there was no need for return subpoenas if the case was closed)
    mutate(return_fta = ifelse(lead(cw_fta_athearing == 1), 1, 0), .by = docket_number) %>%
    # keep just first instance of case being marked MBT (want to ID first ADA who got it marked, not all those after).
    # default = FALSE: the first hearing has no predecessor; with the NA default
    # a case marked MBT at its first hearing became NA and was later set to 0
    mutate(marked_mbt = ifelse(marked_mbt == 1 & lag(marked_mbt == 1, default = FALSE), 0, marked_mbt), .by = docket_number) %>%
    mutate(disco_marked_closed = ifelse(grepl("discov?e?r?y? ?(is|passed)? ?(complete|closed?)", docket_entry_comments_collapse), 1, 0)) %>%
    select(docket_number, docket_day, return_fta, marked_mbt, disco_marked_closed)
  
  df <- df %>%
    # drop the raw marked_mbt so the de-duplicated version from `temp` replaces
    # it. Left in place, a natural join keys on marked_mbt as well: the update
    # never takes effect and every hearing whose value changed fails to match,
    # wiping its return_fta / disco_marked_closed
    select(-marked_mbt) %>%
    left_join(., temp, by = c("docket_number", "docket_day")) %>%
    # fill in return_fta, marked_mbt and disco variables (return_fta is NA if listing was last listing in case, should be 0)
    mutate(marked_mbt = ifelse(is.na(marked_mbt), 0, marked_mbt),
           return_fta = ifelse(is.na(return_fta), 0, return_fta),
           disco_marked_closed = ifelse(is.na(disco_marked_closed), 0, disco_marked_closed)) %>%
    mutate(disco_incomplete = ifelse(grepl("discov?e?r?y? ?(is)? ?incomplete", docket_entry_comments_collapse)|
                                       grepl("No Seizure Analysis|Discovery|Further Investigation",event_disp_reason), 1, 0)) %>%
    # count hearings with CW FTA as also CW Not Ready 
    mutate(cw_notready_combo = ifelse(cw_notready == 1 | cw_fta_athearing == 1 | disco_incomplete == 1, 1, 0))
  
  return(df)
}


#-------------------------------------------------------------------------------#
# BUILD DATASET ----------------------------------------------------------------#
#-------------------------------------------------------------------------------#

# Main build: starts from the merged case-listings / docket-entries data and
# applies each build step in order. Several steps rely on columns created by
# earlier ones, so the order of the pipeline matters. The result is saved as
# final_benchmark_df.rds at the end.
df <- merge_cl_de() %>%
  # get cases with full information
  getSharedCases(.) %>%
  # get lead charge info
  getLeadCharges(.) %>%
  # limit dataset to misdemeanor lead charge cases disposed in Municipal Courts
  filter(startsWith(docket_number, "MC51CR") & lead_charge_grade_atCU == "M") %>%
  filter(dispo_type != "Non-Disposition") %>%
  # collapse notes to hearing level
  collapseDocketEntries(.) %>% 
  # get relevant hearings (do before handling diversions bc that code relies on post-bail listing number)
  getRelevantHearings(.) %>%
  # get some extra variables from cases df DAOCMS, filter to cases after 2010
  getCasesVars(.) %>%
  # remove cases disposed after pandemic and cases in 2010 because ADA data not good back then
  # also allows for correct opportunity time frames for priors and recidivism variables
  filter(disposition_date < date("2020-03-01")) %>%
  filter(case_open_date >= "2011-01-01") %>%  
  # Get NIBRS / UCR offense information
  cleanStatutes(.) %>%
  # NOTE(review): `.` appears here only inside the nested as.data.frame() call,
  # so magrittr still passes the piped data as the first argument as well --
  # confirm this matches the signature of get_nibrs_ucr_cats()
  get_nibrs_ucr_cats(as.data.frame(.),
                     title_var = statute_title,
                     section_var = statute_section,
                     subsection_var = statute_subsection,
                     desc_var = lead_charge_description,
                     add_flag = FALSE) %>%
  # get crime categories (at disposition and charging)
  getCrimeCat(.) %>%
  getLeadOffAtCharging(.) %>%
  # drop one case that is missing defendant IDs
  filter(!is.na(defendant_pid)) %>%
  # remove cases diverted at charging & make corrections to remaining diversion cases
  removeChargingDiversions(.) %>%
  correctDiversionDispos(.) %>%
  getDiversionEndDate(.) %>%
  # also get 404 withdrawals
  get404Vars(.) %>%
  # drop cases that were diverted even if they later came back to MC (missing docket entries for diversion cases)
  dropDiversionsAlternate(.) %>%
  # identify FTAs & witness FTA for CW or Defense
  getFTA(., docket_entry_colname = "docket_entry_comments_collapse") %>%
  getWitnessFTAType(.) %>%
  # remove FTA columns we don't need
  select(-ends_with("fta_clean"), -ends_with("fta_junk")) %>%
  # Fill in NAs in officer and witness FTA variables
  # (applies to every column whose name ends in "_fta")
  mutate(., across(ends_with("_fta"), ~ifelse(is.na(.x),0,.x))) %>%
  getVictimInfo(.) %>%
  getWitnessInfo(.) %>%
  getDefendantInfo(.) %>%
  getAttyInfo(.) %>%
  # update victim FTA information based on victim / witness cases
  updateVictimFTAs(.) %>%
  getCaseFTAInfo(.) %>%
  # Get priors, recidivism, etc.
  getDispoVars(.) %>%
  getSentencesDAOCMS(.) %>%
  getCurrentProbation(.) %>%
  getPriors(.) %>%
  getPriorArrest(.) %>%
  getRecid(.) %>%
  getRecidArrest(.) %>%
  getChargingVars(.) %>%
  getPretrialReleaseStatus(.) %>%
  getCaseLength(.) %>%
  # Create variables for analyses (makes lots of indicator variables)
  getOffenseIndicators(.) %>%
  getDefendantIndicators(.) %>%
  getShifts(.) %>%
  # get ADA names at each hearing
  # (from here on the data can have several rows per hearing, one per ADA)
  mergeADAs(., getADAIdentifiers()) %>%
  cleanADAdockets(.) %>%
  getJudges(.) %>%
  getHearingOutcomes(.) 

# persist the finished build; getSampleSelectionTable() reads this file back
saveRDS(df, file = "/srv/data/penn/what_makes_an_effective_prosecutor/final_benchmark_df.rds")


#-------------------------------------------------------------------------------#
# GET SAMPLE SELECTION TABLE ---------------------------------------------------#
#-------------------------------------------------------------------------------#

# Helper function to get summary stats for a given sample
getSummaryTab <- function(sample){
  # Summarises one sample as a long two-column table (variable, mean):
  # case-level means of defendant, offense and criminal-history variables,
  # rounded to 2 digits, followed by counts of cases, distinct hearings
  # (id_dlc) and rows of `sample`.
  
  # number of distinct hearings, as a 1x1 table
  hearing_count <- sample %>%
    distinct(id_dlc) %>%
    summarize(Hearings = n())
  
  # every row of the sample counts as one ADA-hearing observation
  row_count <- nrow(sample)
  
  # one row per case, with is_dv recoded to 1/0 so it can be averaged
  case_level <- sample %>%
    distinct(docket_number, .keep_all = TRUE) %>%
    mutate(is_dv = ifelse(is_dv == TRUE, 1, 0))
  
  case_means <- case_level %>%
    summarize(Male = round(mean(male, na.rm = TRUE), digits = 2),
              Black = round(mean(black), digits = 2),
              Hispanic = round(mean(hisp), digits = 2),
              `Defendant Age at Arrest` = round(mean(def_age_arrest, na.rm = TRUE), digits = 2),
              `Violent` = round(mean(violent), digits = 2),
              `Drugs` = round(mean(drug), digits = 2),
              `DUI` = round(mean(dui), digits = 2),
              `Property` = round(mean(property), digits = 2),
              `Public Order` = round(mean(public_order), digits = 2),
              `DV` = round(mean(is_dv), digits = 2),
              `Number of Current Charges` = round(mean(num_charges, na.rm = TRUE), digits = 2),
              `Prior Arrest in Past Year` = round(mean(priorarrest_past_year), digits = 2),
              `Prior Misdemeanor Conviction` = round(mean(prior_misd_conviction), digits = 2),
              `Prior Felony Conviction` = round(mean(prior_fel_conviction), digits = 2),
              `Prior Violent Conviction` = round(mean(prior_violent_conviction), digits = 2),
              `Prior Incarceration` = round(mean(prior_incarceration), digits = 2),
              `Number of Prior Arrests` = round(mean(number_of_prior_arrests, na.rm = TRUE), digits = 2),
              `Pending Case at Open` = round(mean(pending_case_at_open), digits = 2),
              `Bail Posted Within 3 Days` = round(mean(bail_posted_in_3days, na.rm = TRUE), digits = 2),
              Cases = n())
  
  # append the two counts, then reshape every column into (variable, mean)
  case_means %>%
    mutate(Hearings = as.numeric(hearing_count),
           `ADA Hearings` = as.numeric(row_count)) %>%
    pivot_longer(.,
                 cols = 1:ncol(.),
                 names_to = "variable",
                 values_to = "mean")
}

# Function to get a table showing sample selection / where cases are 
# dropped at each stage of the process.
getSampleSelectionTable <- function(){
  # Builds the "Progression of Sample Build" LaTeX table. Its four columns
  # summarise (1) all misdemeanor cases, (2) excluded cases, (3) cases missing
  # ADA identification and (4) the main analysis sample, each via
  # getSummaryTab(). Returns the styled kable object.
  
  # Municipal Court rooms whose listings are kept
  mc_court_rooms <- c("403","406","503","506","603","606","703","706","803","806","903","906","1103")
  
  # COLUMN 1: Misdemeanor lead charge cases from 2011 to March 2020
  all_misdemeanor_cases <- readRDS("/srv/data/penn/court_actor_FTA/CLDEmerged.rds") %>%
    # get cases with full information
    getSharedCases() %>%
    getLeadCharges() %>%
    # limit dataset to misdemeanor lead charge cases disposed in Municipal Courts
    filter(startsWith(docket_number, "MC51CR") & lead_charge_grade_atCU == "M") %>%
    filter(dispo_type != "Non-Disposition") %>%
    collapseDocketEntries() %>%
    getRelevantHearings() %>%
    getCasesVars() %>%
    # remove cases disposed after pandemic and cases in 2010 because ADA data not good back then
    # also allows for correct opportunity time frames for priors and recidivism variables
    filter(disposition_date < date("2020-03-01")) %>%
    filter(case_open_date >= "2011-01-01") %>%
    cleanStatutes() %>%
    get_nibrs_ucr_cats(as.data.frame(.),
                       title_var = statute_title,
                       section_var = statute_section,
                       subsection_var = statute_subsection,
                       desc_var = lead_charge_description,
                       add_flag = FALSE) %>%
    getCrimeCat() %>%
    getLeadOffAtCharging() %>%
    # drop one case that is missing defendant IDs
    filter(!is.na(defendant_pid)) %>%
    getDefendantInfo() %>%
    getDefendantIndicators() %>%
    getOffenseIndicators() %>%
    getPriors() %>%
    getPriorArrest() %>%
    getPretrialReleaseStatus()
  
  # COLUMN 4: Main Analysis Sample
  main_sample <- readRDS("/srv/data/penn/what_makes_an_effective_prosecutor/final_benchmark_df.rds") %>%
    # Keep only listings in MC room
    filter(court_room_num %in% mc_court_rooms) %>%
    # keep hearings with at least one ADA identified and ADAs with at least 50 rows
    filter(!is.na(ada_name)) %>%
    mutate(caseload = n(), .by = ada_name) %>%
    filter(caseload >= 50) %>%
    # drop levels of factor ADA names that don't exist after filtering
    droplevels() %>%
    # need key for unique docket-listing-adas
    mutate(docket_listing_ada = paste(docket_number, listing_number, ada_number)) %>%
    # drop if outcomes are missing (drops one case)
    filter(!is.na(recid_win_2yr_fromOGopen))
  
  # drop a few hearings with unclear / conflicting court and judge information
  # (need distinct ADA-docket-listings): every row whose key occurs more than
  # once is removed, including the first occurrence
  repeated_keys <- main_sample$docket_listing_ada[duplicated(main_sample$docket_listing_ada)]
  main_sample <- filter(main_sample, !docket_listing_ada %in% repeated_keys)
  
  # COLUMN 2: All Other Excluded Hearings / Cases (due to diversion, caseload, etc.)
  excluded_cases <- filter(all_misdemeanor_cases,
                           !docket_number %in% main_sample$docket_number)
  
  # COLUMN 3: Missing ADA Identification
  missing_ADA <- excluded_cases %>%
    # get ADA names at each hearing
    mergeADAs(getADAIdentifiers()) %>%
    cleanADAdockets() %>%
    filter(court_room_num %in% mc_court_rooms) %>%
    filter(is.na(ada_name))
  
  # Remove those missing ADA identification from other excluded cases bucket
  excluded_cases <- filter(excluded_cases,
                           !docket_number %in% missing_ADA$docket_number)
  
  # Create table showing progression of build: one summary column per sample
  tab1 <- rename(getSummaryTab(all_misdemeanor_cases), `All Misdemeanor Cases` = mean)
  tab2 <- rename(getSummaryTab(excluded_cases), `Excluded Cases` = mean)
  tab3 <- rename(getSummaryTab(missing_ADA), `Missing ADA Identification` = mean)
  tab4 <- rename(getSummaryTab(main_sample), `Main Sample` = mean)
  
  # the summaries are joined on the columns they share
  left_join(tab1, tab2) %>%
    left_join(tab3) %>%
    left_join(tab4) %>%
    kable(caption = "Progression of Sample Build", booktabs = TRUE, format = "latex") %>%
    kable_styling(latex_options = "striped")
}

# Build the sample-selection table (a LaTeX kable). This re-reads the saved
# build from disk and re-runs the early build steps on the merged raw file.
tabA1 <- getSampleSelectionTable()


